[Snippets] Some optimizations (#12384)

- Implement a static TileScheduler to handle compile-params processing; compile params are now accessed only there
- TileScheduler should emit code only for the necessary scalar/vector Tiles
- Perform abstract-to-physical register mapping in one place (currently the KernelEmitter constructor)
- Implement more precise register mapping so that larger subgraphs can be created (now up to 12 i/o regs instead of 7)

Co-authored-by: Ivan Novoselov <ivan.novoselov@intel.com>
Alexandra Sidorova 2022-08-30 18:26:43 +04:00 committed by GitHub
parent fc27a6b49f
commit 69c514563c
95 changed files with 4401 additions and 1107 deletions

View File

@ -51,5 +51,7 @@ public:
virtual ~Emitter() = default;
};
using AllocatedEmitter = std::pair<std::shared_ptr<Emitter>, ngraph::snippets::RegInfo>;
} // namespace snippets
} // namespace ngraph

View File

@ -18,7 +18,7 @@ auto getRegisters(std::shared_ptr<ngraph::Node>& n) -> ngraph::snippets::RegInfo
/**
* @interface TargetMachine
* @brief Base class Target machine representation. Target derives from this class to provide generator information about supported emittors
* @brief Base class for the Target machine representation. A Target derives from this class to provide the generator with information about supported emitters
* @ingroup snippets
*/
class TargetMachine {
@ -41,9 +41,10 @@ public:
*/
virtual size_t get_lanes() const = 0;
/**
* @brief called by generator to all the emittor for a target machine
* @return a map by node's type info with callbacks to create an instance of emmitter for corresponding operation type
* @brief called by the generator to get an emitter factory for a target machine
* @return a callback that creates an emitter instance for the corresponding operation type
*/
std::function<std::shared_ptr<Emitter>(std::shared_ptr<ngraph::Node>)> get(const ngraph::DiscreteTypeInfo type) const {
auto jitter = jitters.find(type);
@ -118,6 +119,12 @@ public:
*/
code generate(std::shared_ptr<ov::Model>& m, const void* compile_params = nullptr) const;
/**
* @brief gets target machine
* @return pointer to constant target machine
*/
std::shared_ptr<const TargetMachine> get_target_machine() const { return target; }
protected:
std::shared_ptr<TargetMachine> target;
};

View File

@ -1,34 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/op/op.hpp>
#include "load.hpp"
namespace ngraph {
namespace snippets {
namespace op {
/**
* @interface BlockedLoad
* @brief Generated by Canonicalization step for blocked data (NCHW<X>c) to be loaded
* @ingroup snippets
*/
class BlockedLoad : public Load {
public:
OPENVINO_OP("BlockedLoad", "SnippetsOpset", ngraph::snippets::op::Load);
BlockedLoad(const Output<Node>& x);
BlockedLoad() = default;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
check_new_args_count(this, new_args);
return std::make_shared<BlockedLoad>(new_args.at(0));
}
};
} // namespace op
} // namespace snippets
} // namespace ngraph

View File

@ -1,36 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/op/op.hpp>
#include <ngraph/op/parameter.hpp>
namespace ngraph {
namespace snippets {
namespace op {
/**
* @interface BlockedParameter
* @brief Represents blocked input (NCHW<X>c) for a subgraph
* @ingroup snippets
*/
class BlockedParameter : public ngraph::op::Parameter {
public:
OPENVINO_OP("BlockedParameter", "SnippetsOpset", ngraph::op::Parameter);
BlockedParameter() = default;
BlockedParameter(const ngraph::element::Type& element_type, const PartialShape& pshape)
: Parameter(element_type, pshape) {
}
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
check_new_args_count(this, new_args);
return std::make_shared<BlockedParameter>(m_element_type, m_partial_shape);
}
};
} // namespace op
} // namespace snippets
} // namespace ngraph

View File

@ -0,0 +1,38 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <openvino/op/convert.hpp>
#include "ngraph/op/op.hpp"
namespace ngraph {
namespace snippets {
namespace op {
/**
* @interface ConvertSaturation
* @brief An ordinary Convert op with specific rules for integer conversion.
* The implementation uses "saturation" conversion for integer values:
* if an integer value is outside the limits
* of the maximum and minimum values of the destination data type, it is clamped.
* For example, int32_t ---> int8_t
* 129 ---> 127
* @ingroup snippets
*/
class ConvertSaturation : public ov::op::v0::Convert {
public:
OPENVINO_OP("ConvertSaturation", "SnippetsOpset", ov::op::v0::Convert);
ConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type);
ConvertSaturation() = default;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
bool has_evaluate() const override { return false; }
};
} // namespace op
} // namespace snippets
} // namespace ngraph
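
For illustration, a minimal standalone sketch of the saturation rule described above (saturate_to_i8 is a hypothetical helper, not the actual JIT emitter):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>

// Out-of-range values are clamped to the destination type's limits.
int8_t saturate_to_i8(int32_t v) {
    constexpr int32_t lo = std::numeric_limits<int8_t>::min();  // -128
    constexpr int32_t hi = std::numeric_limits<int8_t>::max();  //  127
    return static_cast<int8_t>(std::clamp(v, lo, hi));
}

int main() {
    std::cout << static_cast<int>(saturate_to_i8(129)) << "\n";  // prints 127
}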

View File

@ -0,0 +1,37 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <openvino/op/convert.hpp>
#include "ngraph/op/op.hpp"
namespace ngraph {
namespace snippets {
namespace op {
/**
* @interface ConvertTruncation
* @brief An ordinary Convert op with specific rules for integer conversion.
* The implementation uses "truncation" conversion for integer values:
* on overflow, the integer values wrap around.
* For example, int32_t ---> int8_t
* 129 ---> -127
* @ingroup snippets
*/
class ConvertTruncation : public ov::op::v0::Convert {
public:
OPENVINO_OP("ConvertTruncation", "SnippetsOpset", ov::op::v0::Convert);
ConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type);
ConvertTruncation() = default;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
bool has_evaluate() const override { return false; }
};
} // namespace op
} // namespace snippets
} // namespace ngraph
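
The matching sketch for the truncation rule (truncate_to_i8 is a hypothetical helper, not the actual JIT emitter):

#include <cstdint>
#include <iostream>

// On overflow the value wraps around: narrowing to int8_t keeps the low
// 8 bits, so 129 (0x81) is reinterpreted as -127.
int8_t truncate_to_i8(int32_t v) {
    return static_cast<int8_t>(v);
}

int main() {
    std::cout << static_cast<int>(truncate_to_i8(129)) << "\n";  // prints -127
}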

View File

@ -12,20 +12,22 @@ namespace op {
/**
* @interface Load
* @brief Generated by Canonicalization step where explicit load instruction should be emmiteed
* ScalarLoad == scalar instruction + post increment
* Load (VectorLoad) == vector instruction + post increment
* BroadcastLoad == scalar instruction - post increment
* BlockedLoad == vector instruction - post increment
* @brief Generated by the Canonicalization step where explicit instructions should be emitted for data loading,
* where the number of elements to load is determined by "count"
* The default value "1" loads one element
* @ingroup snippets
*/
class Load : public ngraph::op::Op {
public:
OPENVINO_OP("Load", "SnippetsOpset");
Load(const Output<Node>& x);
Load(const Output<Node>& x, const size_t count = 1lu);
Load() = default;
size_t get_count() const { return m_count; }
void set_count(const size_t count) { m_count = count; }
bool visit_attributes(AttributeVisitor& visitor) override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
@ -35,6 +37,9 @@ public:
OPENVINO_SUPPRESS_DEPRECATED_START
bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override;
OPENVINO_SUPPRESS_DEPRECATED_END
protected:
size_t m_count = 0lu;
};
} // namespace op

View File

@ -1,34 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/op/op.hpp>
#include "load.hpp"
namespace ngraph {
namespace snippets {
namespace op {
/**
* @interface ScalarLoad
* @brief Generated by Canonicalization for a scalar value load to vector register
* @ingroup snippets
*/
class ScalarLoad : public Load {
public:
OPENVINO_OP("ScalarLoad", "SnippetsOpset", ngraph::snippets::op::Load);
ScalarLoad(const Output<Node>& x);
ScalarLoad() = default;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
check_new_args_count(this, new_args);
return std::make_shared<ScalarLoad>(new_args.at(0));
}
};
} // namespace op
} // namespace snippets
} // namespace ngraph

View File

@ -1,34 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/op/op.hpp>
#include "store.hpp"
namespace ngraph {
namespace snippets {
namespace op {
/**
* @interface ScalarStore
* @brief Generated by Canonicalization for a scalar value store from vector register
* @ingroup snippets
*/
class ScalarStore : public Store {
public:
OPENVINO_OP("ScalarStore", "SnippetsOpset", ngraph::snippets::op::Store);
ScalarStore(const Output<Node>& x);
ScalarStore() = default;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
check_new_args_count(this, new_args);
return std::make_shared<ScalarStore>(new_args.at(0));
}
};
} // namespace op
} // namespace snippets
} // namespace ngraph

View File

@ -11,17 +11,23 @@ namespace snippets {
namespace op {
/**
* @interface Load
* @brief Generated by Canonicalization step where explicit store instruction should be emmiteed
* @interface Store
* @brief Generated by the Canonicalization step where explicit instructions should be emitted for data storing,
* where the number of elements to store is determined by "count"
* The default value "1" stores one element
* @ingroup snippets
*/
class Store : public ngraph::op::Op {
public:
OPENVINO_OP("Store", "SnippetsOpset");
Store(const Output<Node>& x);
Store(const Output<Node>& x, const size_t count = 1lu);
Store() = default;
size_t get_count() const { return m_count; }
void set_count(const size_t count) { m_count = count; }
bool visit_attributes(AttributeVisitor& visitor) override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
@ -31,6 +37,9 @@ public:
OPENVINO_SUPPRESS_DEPRECATED_START
bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override;
OPENVINO_SUPPRESS_DEPRECATED_END
protected:
size_t m_count = 0lu;
};
} // namespace op
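
A conceptual model of the Load/Store "count" semantics introduced above (load_elements and store_elements are hypothetical helpers, not the emitters): count == lanes gives a vector access, count == 1 a scalar one, and the data pointer advances by the number of elements accessed.

#include <cstddef>
#include <cstring>

template <typename T>
void load_elements(const T*& src, T* reg, std::size_t count) {
    std::memcpy(reg, src, count * sizeof(T));
    src += count;  // post-increment, mirroring the pointer advance after a Load
}

template <typename T>
void store_elements(T*& dst, const T* reg, std::size_t count) {
    std::memcpy(dst, reg, count * sizeof(T));
    dst += count;  // post-increment after a Store
}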

View File

@ -89,10 +89,9 @@ public:
}
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes,
ngraph::pass::Manager& opt, const void* compile_params = nullptr);
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes,
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, ngraph::pass::Manager& opt,
const void* compile_params = nullptr);
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, const void* compile_params = nullptr);
snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr);
snippets::Schedule generate(const void* compile_params = nullptr);
Shape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes);
@ -107,8 +106,10 @@ public:
void serialize() const;
static auto wrap_node_as_subgraph(const std::shared_ptr<ngraph::Node>& node) -> std::shared_ptr<Subgraph>;
static void fill_empty_output_names(const Output<Node>& target_output_node, const Output<Node>& replacement_output_node);
private:
void align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes);
void convert_to_snippet_dialect();
Shape exec_domain;
std::shared_ptr<ov::Model> m_body;

View File

@ -20,14 +20,27 @@ class Tile : public ngraph::op::Op {
public:
OPENVINO_OP("Tile", "SnippetsOpset");
Tile(const std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>>& region);
/// \brief Construct a Tile
/// \param region The vector of pairs: emitters and the corresponding registers
/// \param increment Tile size - the count of elements to load and store.
/// A vector Tile should use the vector register size; a scalar Tile should use 1
/// \param num_inputs Count of inputs
/// \param num_outputs Count of outputs
/// \param io_dims Vector of last dimensions of inputs and outputs
/// \param io_data_sizes Vector of data type sizes of inputs and outputs
Tile(const std::vector<AllocatedEmitter>& region, size_t increment, size_t num_inputs, size_t num_outputs,
const std::vector<size_t>& io_dims, const std::vector<size_t>& io_data_sizes);
Tile() = default;
std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>> region;
std::vector<AllocatedEmitter> region;
size_t increment = 0;
size_t num_inputs = 0;
size_t num_outputs = 0;
std::vector<size_t> io_dims {};
std::vector<size_t> io_data_size {};
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override {
return std::make_shared<Tile>(region);
return std::make_shared<Tile>(region, increment, num_inputs, num_outputs, io_dims, io_data_size);
}
const void *compile_params;
};
} // namespace op
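
A sketch of the loop a Tile describes, under the assumption that the tile body consumes `increment` elements per iteration (run_tile is an illustrative name, not the emitter API):

#include <cstddef>
#include <functional>

// A vector Tile uses the vector register width as the increment, a scalar
// Tile uses 1; the body runs while at least `increment` elements remain.
void run_tile(std::size_t work_amount, std::size_t increment,
              const std::function<void(std::size_t)>& body) {
    for (std::size_t start = 0; start + increment <= work_amount; start += increment)
        body(start);  // loads/stores `increment` elements beginning at `start`
}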

View File

@ -0,0 +1,39 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "ngraph/op/op.hpp"
#include "snippets/emitter.hpp"
#include "tile.hpp"
namespace ngraph {
namespace snippets {
namespace op {
/**
* @interface TileScheduler
* @brief Contains a set of Tiles (currently one vector and one scalar) and performs the necessary preparations
* before the Tiles can be executed: calculates offsets, sets proper work amounts, and decrements pointers if the same data
* has to be read several times (broadcasting).
* @ingroup snippets
*/
class TileScheduler : public ngraph::op::Op {
public:
OPENVINO_OP("TileScheduler", "SnippetsOpset");
TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region);
TileScheduler() = default;
AllocatedEmitter vector_region;
AllocatedEmitter scalar_region;
// todo: this clone_with_new_inputs is irrelevant
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override {
return std::make_shared<TileScheduler>(vector_region, scalar_region);
}
const void *compile_params;
};
} // namespace op
} // namespace snippets
} // namespace ngraph
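
An illustrative sketch of the scheduling policy described above, assuming one vector and one scalar Tile (schedule_tiles is a hypothetical helper, not the actual emitter):

#include <cstddef>
#include <functional>

// The vector Tile covers the part of the work amount divisible by the lane
// count; the scalar Tile handles the remainder (the tail).
void schedule_tiles(std::size_t work_amount, std::size_t lanes,
                    const std::function<void(std::size_t, std::size_t)>& vector_tile,
                    const std::function<void(std::size_t, std::size_t)>& scalar_tile) {
    const std::size_t vector_work = work_amount - work_amount % lanes;
    if (vector_work != 0)
        vector_tile(0, vector_work);            // `lanes` elements per step
    if (vector_work != work_amount)
        scalar_tile(vector_work, work_amount);  // one element per step
}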

View File

@ -1,34 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/op/op.hpp>
#include "load.hpp"
namespace ngraph {
namespace snippets {
namespace op {
/**
* @interface VectorLoad
* @brief Generated by Canonicalization for a vector value load to vector register
* @ingroup snippets
*/
class VectorLoad : public Load {
public:
OPENVINO_OP("VectorLoad", "SnippetsOpset", ngraph::snippets::op::Load);
VectorLoad(const Output<Node>& x);
VectorLoad() = default;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
check_new_args_count(this, new_args);
return std::make_shared<VectorLoad>(new_args.at(0));
}
};
} // namespace op
} // namespace snippets
} // namespace ngraph

View File

@ -1,34 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/op/op.hpp>
#include "store.hpp"
namespace ngraph {
namespace snippets {
namespace op {
/**
* @interface VectorStore
* @brief Generated by Canonicalization for a vector value store from vector register
* @ingroup snippets
*/
class VectorStore : public Store {
public:
OPENVINO_OP("VectorStore", "SnippetsOpset", ngraph::snippets::op::Store);
VectorStore(const Output<Node>& x);
VectorStore() = default;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
check_new_args_count(this, new_args);
return std::make_shared<VectorStore>(new_args.at(0));
}
};
} // namespace op
} // namespace snippets
} // namespace ngraph

View File

@ -18,7 +18,7 @@ namespace pass {
*/
class AssignRegisters : public ngraph::pass::FunctionPass {
public:
AssignRegisters() {
explicit AssignRegisters() {
set_property(ngraph::pass::PassProperty::REQUIRE_STATIC_SHAPE, true);
}
bool run_on_model(const std::shared_ptr<ov::Model>& m) override;

View File

@ -0,0 +1,31 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>
namespace ngraph {
namespace snippets {
namespace pass {
/**
* @interface InsertConvertOnInputs
* @brief Inserts a ConvertSaturation op after Parameters and Scalars to convert the input data types
* to the supported execution data type.
* Note: the ConvertSaturation op isn't covered by the specification of the "Convert" op.
* This op is used for conversion into and from FP32 after the corresponding Load
* and before the Store, to calculate in FP32 inside the subgraph body in the CPU plugin
* @ingroup snippets
*/
class InsertConvertOnInputs: public ngraph::pass::MatcherPass {
public:
InsertConvertOnInputs(const ov::element::Type exec_type = ov::element::f32);
};
} // namespace pass
} // namespace snippets
} // namespace ngraph

View File

@ -19,7 +19,7 @@ namespace pass {
*/
class InsertLoad: public ngraph::pass::MatcherPass {
public:
InsertLoad();
InsertLoad(const size_t count = 1lu);
};
/**
@ -30,7 +30,7 @@ public:
*/
class InsertStore: public ngraph::pass::MatcherPass {
public:
InsertStore();
InsertStore(const size_t count = 1lu);
};

View File

@ -0,0 +1,31 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>
namespace ngraph {
namespace snippets {
namespace pass {
/**
* @interface ResetTypeRelaxedNodePrecision
* @brief Resets precision for type-relaxed nodes inside the body to align precision between nodes.
* Should be called after all Convert insertions
* @ingroup snippets
*/
class ResetTypeRelaxedNodePrecision: public ngraph::pass::FunctionPass {
public:
OPENVINO_RTTI("ResetTypeRelaxedNodePrecision", "0");
ResetTypeRelaxedNodePrecision(const ov::element::Type exec_type = ov::element::f32);
bool run_on_model(const std::shared_ptr<ngraph::Function>& m) override;
private:
ov::element::Type exec_type;
};
} // namespace pass
} // namespace snippets
} // namespace ngraph

View File

@ -0,0 +1,28 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>
namespace ngraph {
namespace snippets {
namespace pass {
/**
* @interface TransformConvertToConvertTruncation
* @brief Transforms Convert into ConvertTruncation, which follows the conversion rules from the specification
* Note: the ConvertTruncation op is covered by the specification of the "Convert" op.
* This op is used for real Convert ops inside the subgraph body in the CPU plugin
* @ingroup snippets
*/
class TransformConvertToConvertTruncation: public ngraph::pass::MatcherPass {
public:
TransformConvertToConvertTruncation();
};
} // namespace pass
} // namespace snippets
} // namespace ngraph

View File

@ -12,27 +12,27 @@ namespace snippets {
namespace pass {
/**
* @interface ReplaceLoadsWithScalarLoads
* @brief Replaces vector loads with scalar versions.
* The pass is used to cange element type of function in a canonical form vector to scalar.
* @interface SetScalarCountForLoad
* @brief Set count `1` for Load so that it is emitted as a scalar load
* The pass changes the element count of Load to "1" to load a scalar value
* Used for tail generation
* @ingroup snippets
*/
class ReplaceLoadsWithScalarLoads: public ngraph::pass::MatcherPass {
class SetScalarCountForLoad: public ngraph::pass::MatcherPass {
public:
ReplaceLoadsWithScalarLoads();
SetScalarCountForLoad();
};
/**
* @interface ReplaceStoresWithScalarStores
* @brief Replaces vector stores with scalar versions.
* The pass is used to cange element type of model in a canonical form vector to scalar.
* @interface SetScalarCountForStore
* @brief Set count `1` for Store so that it is emitted as a scalar store
* The pass changes the element count of Store to "1" to store a scalar value
* Used for tail generation
* @ingroup snippets
*/
class ReplaceStoresWithScalarStores: public ngraph::pass::MatcherPass {
class SetScalarCountForStore: public ngraph::pass::MatcherPass {
public:
ReplaceStoresWithScalarStores();
SetScalarCountForStore();
};
} // namespace pass

View File

@ -7,21 +7,18 @@
#include "ngraph/ops.hpp"
#include <ngraph/opsets/opset1.hpp>
#include "op/blockedload.hpp"
#include "op/blockedparameter.hpp"
#include "op/broadcastload.hpp"
#include "op/broadcastmove.hpp"
#include "op/convert_saturation.hpp"
#include "op/convert_truncation.hpp"
#include "op/kernel.hpp"
#include "op/load.hpp"
#include "op/nop.hpp"
#include "op/scalar.hpp"
#include "op/scalarload.hpp"
#include "op/scalarstore.hpp"
#include "op/powerstatic.hpp"
#include "op/store.hpp"
#include "op/tile.hpp"
#include "op/vectorload.hpp"
#include "op/vectorstore.hpp"
#include "op/tile_scheduler.hpp"
namespace ngraph {
namespace snippets {

View File

@ -11,14 +11,9 @@
// SnippetS dialect
NGRAPH_OP(Load, ngraph::snippets::op)
NGRAPH_OP(ScalarLoad, ngraph::snippets::op)
NGRAPH_OP(VectorLoad, ngraph::snippets::op)
NGRAPH_OP(BlockedLoad, ngraph::snippets::op)
NGRAPH_OP(BroadcastLoad, ngraph::snippets::op)
NGRAPH_OP(Store, ngraph::snippets::op)
NGRAPH_OP(ScalarStore, ngraph::snippets::op)
NGRAPH_OP(VectorStore, ngraph::snippets::op)
NGRAPH_OP(BroadcastMove, ngraph::snippets::op)
NGRAPH_OP(Scalar, ngraph::snippets::op)
@ -29,9 +24,10 @@ NGRAPH_OP(Nop, ngraph::snippets::op)
// opset completeness
NGRAPH_OP(Constant, ngraph::op)
NGRAPH_OP(Parameter, ngraph::op::v0)
NGRAPH_OP(BlockedParameter, ngraph::snippets::op)
NGRAPH_OP(Result, ngraph::op::v0)
NGRAPH_OP(Broadcast, ngraph::op::v1)
NGRAPH_OP(ConvertTruncation, ngraph::snippets::op)
NGRAPH_OP(ConvertSaturation, ngraph::snippets::op)
// unary
NGRAPH_OP(Abs, ngraph::op::v0)

View File

@ -17,7 +17,8 @@ auto ngraph::snippets::getRegisters(std::shared_ptr<ngraph::Node>& n) -> ngraph:
auto rt = n->get_rt_info();
// ToDo: change to reg_t
std::vector<size_t> rout;
std::vector<size_t> rin, rout;
auto it_rt = rt.find("reginfo");
if (it_rt != rt.end()) {
for (auto reg : it_rt->second.as<std::vector<size_t>>()) {
@ -25,12 +26,11 @@ auto ngraph::snippets::getRegisters(std::shared_ptr<ngraph::Node>& n) -> ngraph:
}
}
std::vector<size_t> rin;
for (auto input : n->inputs()) {
for (const auto& input : n->inputs()) {
auto rt = input.get_source_output().get_node_shared_ptr()->get_rt_info();
auto it_rt = rt.find("reginfo");
if (it_rt != rt.end()) {
for (auto reg : it_rt->second.as<std::vector<size_t>>()) {
for (auto& reg : it_rt->second.as<std::vector<size_t>>()) {
rin.push_back(reg);
}
}
@ -48,51 +48,56 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr<ov:
auto results = m->get_results();
auto in = params.size();
auto out = results.size();
auto nptrs = in + out;
std::vector<size_t> io_last_dims(in + out);
std::vector<size_t> io_data_sizes(in + out);
std::transform(params.begin(), params.end(), io_last_dims.begin(),
[](const std::shared_ptr<Node>& n){return n->get_output_shape(0).back();});
std::transform(results.begin(), results.end(), io_last_dims.begin() + in,
[](const std::shared_ptr<Node>& n){return n->get_input_shape(0).back();});
std::transform(params.begin(), params.end(), io_data_sizes.begin(),
[](const std::shared_ptr<Node>& n){return n->get_element_type().size();});
std::transform(results.begin(), results.end(), io_data_sizes.begin() + in,
[](const std::shared_ptr<Node>& n){return n->get_element_type().size();});
OV_ITT_TASK_CHAIN(GENERATE, ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::VectorTile")
// vector tile
std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>> lowered;
std::vector<AllocatedEmitter> lowered;
for (auto n : m->get_ordered_ops()) {
lowered.push_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
}
OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile")
// scalar tile
auto m_scalar = ov::clone_model(*m.get());
ngraph::pass::Manager mng;
mng.register_pass<ngraph::snippets::pass::ReplaceLoadsWithScalarLoads>();
mng.register_pass<ngraph::snippets::pass::ReplaceStoresWithScalarStores>();
mng.register_pass<ngraph::snippets::pass::SetScalarCountForLoad>();
mng.register_pass<ngraph::snippets::pass::SetScalarCountForStore>();
mng.run_passes(m_scalar);
OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile_get")
std::vector<std::pair<std::shared_ptr<Emitter>, RegInfo>> scalar_lowered;
std::vector<AllocatedEmitter> scalar_lowered;
for (auto n : m_scalar->get_ordered_ops()) {
scalar_lowered.push_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
scalar_lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
}
OV_ITT_TASK_NEXT(GENERATE, "::Tiles1D")
OV_ITT_TASK_NEXT(GENERATE, "::Tiles1D");
// wrapping into tiles1D
std::vector<std::pair<std::shared_ptr<Emitter>, RegInfo>> tiles1D;
auto tile = std::make_shared<ngraph::snippets::op::Tile>(lowered);
tile->compile_params = compile_params;
tiles1D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile),
std::make_pair(std::vector<size_t>({target->get_lanes(), 0, nptrs, 1}), std::vector<size_t>{})));
tile = std::make_shared<ngraph::snippets::op::Tile>(scalar_lowered);
tile->compile_params = compile_params;
tiles1D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile),
std::make_pair(std::vector<size_t>{{1, target->get_lanes(), nptrs, 1}}, std::vector<size_t>{})));
//todo: in, out, and io_last_dims should derive naturally from the graph representation
const auto& vector_tile = std::make_shared<ngraph::snippets::op::Tile>(lowered, target->get_lanes(), in, out, io_last_dims, io_data_sizes);
const auto& vector_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(vector_tile),
std::make_pair(std::vector<size_t>{}, std::vector<size_t>{}));
const auto& scalar_tile = std::make_shared<ngraph::snippets::op::Tile>(scalar_lowered, 1, in, out, io_last_dims, io_data_sizes);
const auto& scalar_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(scalar_tile),
std::make_pair(std::vector<size_t>{}, std::vector<size_t>{}));
OV_ITT_TASK_NEXT(GENERATE, "::Tiles2D")
// wrapping into tiles2D
std::vector<std::pair<std::shared_ptr<Emitter>, RegInfo>> tiles2D;
tile = std::make_shared<ngraph::snippets::op::Tile>(tiles1D);
tile->compile_params = compile_params;
tiles2D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile),
std::make_pair(std::vector<size_t>({1, 0, nptrs, 0}), std::vector<size_t>{})));
auto tile_scheduler = std::make_shared<ngraph::snippets::op::TileScheduler>(vector_region, scalar_region);
tile_scheduler->compile_params = compile_params;
const auto& tile_scheduler_region = std::make_pair(target->get(ngraph::snippets::op::TileScheduler::get_type_info_static())(tile_scheduler),
std::make_pair(std::vector<size_t>({in, out, target->get_lanes()}), std::vector<size_t>{}));
OV_ITT_TASK_NEXT(GENERATE, "::EmitCode")
// emission
auto tiles2DKernel = std::make_shared<ngraph::snippets::op::Kernel>(tiles2D);
auto tiles2DKernel = std::make_shared<ngraph::snippets::op::Kernel>(std::vector<AllocatedEmitter> {tile_scheduler_region});
tiles2DKernel->compile_params = compile_params;
std::shared_ptr<Emitter> kernel = target->get(ngraph::snippets::op::Kernel::get_type_info_static())(tiles2DKernel);
kernel->emit_code({in, out}, {});

View File

@ -1,10 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/op/blockedload.hpp"
using namespace ngraph;
snippets::op::BlockedLoad::BlockedLoad(const Output<Node>& x) : Load(x) {
}

View File

@ -0,0 +1,19 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/itt.hpp"
#include "snippets/op/convert_saturation.hpp"
#include "ngraph/runtime/host_tensor.hpp"
ngraph::snippets::op::ConvertSaturation::ConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type)
: ov::op::v0::Convert({x}, destination_type) {
}
std::shared_ptr<ngraph::Node> ngraph::snippets::op::ConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(ConvertSaturation_clone_with_new_inputs);
check_new_args_count(this, new_args);
return std::make_shared<ConvertSaturation>(new_args.at(0), m_destination_type);
}

View File

@ -0,0 +1,19 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/itt.hpp"
#include "snippets/op/convert_truncation.hpp"
#include "ngraph/runtime/host_tensor.hpp"
ngraph::snippets::op::ConvertTruncation::ConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type)
: ov::op::v0::Convert({x}, destination_type) {
}
std::shared_ptr<ngraph::Node> ngraph::snippets::op::ConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(ConvertTruncation_clone_with_new_inputs);
check_new_args_count(this, new_args);
return std::make_shared<ConvertTruncation>(new_args.at(0), m_destination_type);
}

View File

@ -11,7 +11,7 @@
using namespace std;
using namespace ngraph;
snippets::op::Load::Load(const Output<Node>& x) : Op({x}) {
snippets::op::Load::Load(const Output<Node>& x, const size_t count) : Op({x}), m_count(count) {
constructor_validate_and_infer_types();
}
@ -22,7 +22,7 @@ bool snippets::op::Load::visit_attributes(AttributeVisitor& visitor) {
std::shared_ptr<Node> snippets::op::Load::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(Load);
check_new_args_count(this, new_args);
return std::make_shared<Load>(new_args.at(0));
return std::make_shared<Load>(new_args.at(0), m_count);
}
void snippets::op::Load::validate_and_infer_types() {

View File

@ -1,10 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/op/scalarload.hpp"
using namespace ngraph;
snippets::op::ScalarLoad::ScalarLoad(const Output<Node>& x) : Load(x) {
}

View File

@ -1,10 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/op/scalarstore.hpp"
using namespace ngraph;
snippets::op::ScalarStore::ScalarStore(const Output<Node>& x) : Store(x) {
}

View File

@ -4,14 +4,14 @@
#include <snippets/itt.hpp>
#include "snippets/op/scalarstore.hpp"
#include "snippets/op/store.hpp"
#include <ngraph/runtime/host_tensor.hpp>
using namespace std;
using namespace ngraph;
snippets::op::Store::Store(const Output<Node>& x) : Op({x}) {
snippets::op::Store::Store(const Output<Node>& x, const size_t count) : Op({x}), m_count(count) {
constructor_validate_and_infer_types();
}
@ -22,7 +22,7 @@ bool snippets::op::Store::visit_attributes(AttributeVisitor& visitor) {
std::shared_ptr<Node> snippets::op::Store::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(Store);
check_new_args_count(this, new_args);
return std::make_shared<Store>(new_args.at(0));
return std::make_shared<Store>(new_args.at(0), m_count);
}
void snippets::op::Store::validate_and_infer_types() {

View File

@ -6,6 +6,7 @@
#include "snippets/remarks.hpp"
#include "snippets/op/subgraph.hpp"
#include "snippets/op/convert_saturation.hpp"
#include "snippets/pass/insert_load_store.hpp"
#include "snippets/pass/insert_movebroadcast.hpp"
#include "snippets/pass/load_movebroadcast_to_broadcastload.hpp"
@ -13,8 +14,15 @@
#include "snippets/pass/convert_constants_to_scalars.hpp"
#include "snippets/pass/convert_power_to_powerstatic.hpp"
#include "snippets/pass/vector_to_scalar.hpp"
#include "snippets/pass/transform_convert_to_truncation.hpp"
#include "snippets/pass/insert_convert_on_inputs.hpp"
#include "snippets/pass/reset_type_relaxed_node_precision.hpp"
#include "transformations/common_optimizations/nop_elimination.hpp"
#include "transformations/utils/utils.hpp"
#include <ngraph/pass/manager.hpp>
#include "ngraph/pass/constant_folding.hpp"
#include <openvino/pass/serialize.hpp>
#include <algorithm>
@ -92,6 +100,9 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
auto body_node = node->clone_with_new_inputs(body_inputs);
body_node->set_friendly_name(node->get_friendly_name());
for (size_t i = 0; i < node->get_output_size(); i++) {
fill_empty_output_names(body_node->output(i), node->output(i));
}
if (node->get_output_size() != body_node->get_output_size()) {
throw ngraph::ngraph_error("original node outputs size and extracted subgraph node outputs size doesn't much");
@ -118,6 +129,20 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
return subgraph;
}
void snippets::op::Subgraph::fill_empty_output_names(const Output<Node>& target_output_node, const Output<Node>& replacement_output_node) {
NGRAPH_SUPPRESS_DEPRECATED_START
auto out_tensor = target_output_node.get_tensor_ptr();
const std::string new_name = ngraph::op::util::get_ie_output_name(replacement_output_node);
if (out_tensor->get_name().empty()) {
out_tensor->set_name(new_name);
}
if (!replacement_output_node.get_names().empty()) {
out_tensor->set_names(replacement_output_node.get_names());
}
NGRAPH_SUPPRESS_DEPRECATED_END
}
///
/// \brief Canonization transforms the original subgraph to a canonical form suitable for code generation. In particular,
/// it handles supported layout conversions, broadcasts inputs and outputs to a single rank and layout. Canonicalization
@ -125,6 +150,7 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
/// Canonicalization currently supports only the following layout conversions:
/// * None: all inputs have the same layout
/// * Planar + blocked: some inputs have blocked, and some have planar layouts, e.g. <N, C, H, W, c> + <N, C, H, W>
/// Precision is also aligned inside the subgraph body during canonicalization
Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) {
INTERNAL_OP_SCOPE(Subgraph);
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::canonicalize")
@ -176,7 +202,8 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape
PartialShape::broadcast_merge_into(tmpPShape, inShape, ::ngraph::op::AutoBroadcastType::NUMPY),
"Failed to create broadcastable shapes in snippets canonicalization");
const auto paramShape = m_body->get_parameters()[i]->get_shape();
if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin()))
const auto paramType = m_body->get_parameters()[i]->get_element_type();
if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin()) || paramType != inType)
m_body->replace_parameter(i, std::make_shared<opset1::Parameter>(inType, inShape));
}
@ -213,21 +240,80 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape
::ngraph::op::AutoBroadcastType::NUMPY);
NODE_VALIDATION_CHECK(this, compatibleWithOtherOutputs, "Snippets output shapes must be numpy broadcastable");
}
// We should insert Converts after Parameters and Constants and before Results
// to align precision inside the Subgraph body with what the plugin supports
align_element_types(outputShapes, inputShapes);
exec_domain = outPShape.get_shape();
return exec_domain;
}
void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes,
const BlockedShapeVector& inputShapes) {
// TODO: At the moment snippets support execution in only one element type
const auto execution_element_type = ov::element::f32;
ngraph::pass::Manager p_manager;
p_manager.register_pass<snippets::pass::TransformConvertToConvertTruncation>();
p_manager.run_passes(m_body);
const auto& body_results = m_body->get_results();
for (size_t i = 0; i < outputShapes.size(); i++) {
const auto needed_out_type = std::get<2>(outputShapes[i]);
// If there is a real Convert from the graph (ConvertTruncation) before a Result,
// we should check the destination type and insert a ConvertSaturation before it if needed,
// for example, to restore the original element type after Convert insertion on inputs
std::shared_ptr<ov::Node> first_convert = body_results[i];
while (ov::is_type<ngraph::snippets::op::ConvertTruncation>(first_convert->get_input_node_ptr(0))) {
first_convert = first_convert->get_input_node_shared_ptr(0);
}
if (auto existing_convert_t = ngraph::as_type_ptr<ngraph::snippets::op::ConvertTruncation>(first_convert)) {
const auto original_input_element_type = existing_convert_t->get_input_element_type(0);
if (original_input_element_type != execution_element_type) {
const auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(
existing_convert_t->get_input_node_shared_ptr(0), original_input_element_type);
existing_convert_t->set_argument(0, convert);
}
}
// We should insert Convert before Results to return original output element type
const auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(
body_results[i]->get_input_node_shared_ptr(0), needed_out_type);
body_results[i]->set_argument(0, convert);
}
// After Convert insertion we should take the following steps:
// - insert ConvertSaturation after inputs and scalars to start aligning the exec data type inside the body
// - manually set output element types of type relaxed nodes to align element type inside the subgraph body
// - after Convert insertion on inputs and after scalars, use the ConstantFolding pass to convert
//   the element type of Scalars before inference
// - eliminate redundant Converts that could have been inserted
ngraph::pass::Manager manager;
manager.register_pass<snippets::pass::InsertConvertOnInputs>(execution_element_type);
manager.register_pass<snippets::pass::ResetTypeRelaxedNodePrecision>(execution_element_type);
manager.register_pass<ngraph::pass::ConstantFolding>();
manager.register_pass<ngraph::pass::EliminateConvert>();
manager.run_passes(m_body);
}
void snippets::op::Subgraph::convert_to_snippet_dialect() {
INTERNAL_OP_SCOPE(Subgraph);
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::convert_to_snippet_dialect")
auto skip_matching_domain = [](const std::shared_ptr<const ov::Node>& n) -> bool {
return n->get_input_shape(0).back() != 1;
};
// At the moment we support only full-vector Load/Store and scalar Load/Store, so the count is equal to the lane count.
// Later we are going to support variadic Load/Store with a different element count
const size_t count = m_generator->get_target_machine()->get_lanes();
ngraph::pass::Manager manager;
manager.register_pass<snippets::pass::ConvertConstantsToScalars>();
manager.register_pass<snippets::pass::ConvertPowerToPowerStatic>();
manager.register_pass<snippets::pass::InsertLoad>();
manager.register_pass<snippets::pass::InsertStore>();
manager.register_pass<snippets::pass::InsertLoad>(count);
manager.register_pass<snippets::pass::InsertStore>(count);
manager.register_pass<snippets::pass::InsertMoveBroadcast>();
manager.register_pass<snippets::pass::LoadMoveBroadcastToBroadcastLoad>();
// Note that BroadcastMove is typically inserted right after the Load. Such cases are typical for
@ -246,12 +332,12 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() {
// Result
// Note: Load* should be replaced with ScalarLoad in this example to avoid invalid read in vector Tile.
if (!exec_domain.empty() && exec_domain.back() != 1) {
manager.register_pass<snippets::pass::ReplaceLoadsWithScalarLoads>();
manager.register_pass<snippets::pass::ReplaceStoresWithScalarStores>();
manager.register_pass<snippets::pass::SetScalarCountForLoad>();
manager.register_pass<snippets::pass::SetScalarCountForStore>();
manager.get_pass_config()->
set_callback<ngraph::snippets::pass::ReplaceLoadsWithScalarLoads>(skip_matching_domain);
set_callback<ngraph::snippets::pass::SetScalarCountForLoad>(skip_matching_domain);
manager.get_pass_config()->
set_callback<ngraph::snippets::pass::ReplaceStoresWithScalarStores>(skip_matching_domain);
set_callback<ngraph::snippets::pass::SetScalarCountForStore>(skip_matching_domain);
}
manager.run_passes(m_body);
}

View File

@ -8,5 +8,8 @@
using namespace std;
using namespace ngraph;
snippets::op::Tile::Tile(const std::vector<std::pair<std::shared_ptr<snippets::Emitter>, snippets::RegInfo>>& nested) : Op(), region(nested) {
snippets::op::Tile::Tile(const std::vector<AllocatedEmitter>& region, size_t increment,
size_t num_inputs, size_t num_outputs,
const std::vector<size_t>& io_dims, const std::vector<size_t>& io_data_sizes) :
Op(), region(region), increment(increment), num_inputs(num_inputs), num_outputs(num_outputs), io_dims(io_dims), io_data_size(io_data_sizes) {
}

View File

@ -0,0 +1,10 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/op/tile_scheduler.hpp"
#include "snippets/generator.hpp"
ngraph::snippets::op::TileScheduler::TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region)
: Op(), vector_region{vector_region}, scalar_region{scalar_region} {
}

View File

@ -1,10 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/op/vectorload.hpp"
using namespace ngraph;
snippets::op::VectorLoad::VectorLoad(const Output<Node>& x) : Load(x) {
}

View File

@ -1,10 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/op/vectorstore.hpp"
using namespace ngraph;
snippets::op::VectorStore::VectorStore(const Output<Node>& x) : Store(x) {
}

View File

@ -16,7 +16,6 @@
bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr<ov::Model>& f) {
RUN_ON_MODEL_SCOPE(AssignRegisters);
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::AssignRegisters")
int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1
using Reg = size_t;
auto ops = f->get_ordered_ops();
decltype(ops) stmts;
@ -26,8 +25,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
size_t rdx = 0;
std::map<std::shared_ptr<descriptor::Tensor>, Reg> regs;
for (auto op : stmts) {
for (auto output : op->outputs()) {
for (const auto& op : stmts) {
for (const auto& output : op->outputs()) {
regs[output.get_tensor_ptr()] = rdx++;
}
}
@ -35,9 +34,9 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
std::vector<std::set<Reg>> used;
std::vector<std::set<Reg>> def;
for (auto op : stmts) {
for (const auto& op : stmts) {
std::set<Reg> u;
for (auto input : op->inputs()) {
for (const auto& input : op->inputs()) {
if (regs.count(input.get_tensor_ptr())) {
u.insert(regs[input.get_tensor_ptr()]);
}
@ -46,7 +45,7 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
std::set<Reg> d;
if (!std::dynamic_pointer_cast<snippets::op::Store>(op)) {
for (auto output : op->outputs()) {
for (const auto& output : op->outputs()) {
d.insert(regs[output.get_tensor_ptr()]);
}
}
@ -65,8 +64,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
for (size_t n = 0; n < stmts.size(); n++) {
auto node = stmts[n];
if (!std::dynamic_pointer_cast<snippets::op::Store>(node)) {
for (auto out : node->outputs()) {
for (auto port : out.get_target_inputs()) {
for (const auto& out : node->outputs()) {
for (const auto& port : out.get_target_inputs()) {
auto pos = std::find(stmts.begin(), stmts.end(), port.get_node()->shared_from_this());
if (pos != stmts.end()) {
auto k = pos-stmts.begin();
@ -136,46 +135,32 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
std::map<std::shared_ptr<descriptor::Tensor>, Reg> physical_regs;
for (auto reg : regs) {
for (const auto& reg : regs) {
physical_regs[reg.first] = register_map[reg.second];
}
size_t constantID = 0;
for (auto n : f->get_ordered_ops()) {
const auto num_parameters = f->get_parameters().size();
for (const auto& n : f->get_ordered_ops()) {
auto& rt = n->get_rt_info();
// nothing to do for model signature
if (std::dynamic_pointer_cast<opset1::Parameter>(n) || std::dynamic_pointer_cast<opset1::Result>(n)) {
std::vector<size_t> regs;
regs.reserve(n->outputs().size());
/* The main idea here is that each operation stores its output regs in rt["reginfo"]. Input and output regs are
* then derived by parsing node's and parent's rt["reginfo"], look into ngraph::snippets::getRegisters for details.
* Note also that Parameter and Result store general-purpose register index, because they work with memory
* (memory pointer is stored in gpr). All other "regular" ops store vector regs indexes, since calculations are
* performed on registers.
*/
if (is_type<ov::op::v0::Result>(n)) {
continue;
}
// store only effective address
if (auto result = std::dynamic_pointer_cast<snippets::op::Store>(n)) {
auto ea = reg64_tmp_start+static_cast<int64_t>(f->get_result_index(result) + f->get_parameters().size());
rt["effectiveAddress"] = ea;
continue;
}
// store effective address and procced with vector registers
if (ov::as_type_ptr<ngraph::snippets::op::Load>(n) || ov::as_type_ptr<ngraph::snippets::op::BroadcastLoad>(n)) {
auto source = n->get_input_source_output(0).get_node_shared_ptr();
if (auto param = ov::as_type_ptr<opset1::Parameter>(source)) {
auto ea = reg64_tmp_start+static_cast<int64_t>(f->get_parameter_index(param));
rt["effectiveAddress"] = ea;
} else if (auto constant = ov::as_type_ptr<opset1::Constant>(source)) {
auto ea = reg64_tmp_start+static_cast<int64_t>(f->get_parameters().size() + f->get_results().size() + 1 + constantID);
rt["effectiveAddress"] = ea;
constantID++;
} else {
throw ngraph_error("load/broadcast should follow only Parameter or non-Scalar constant");
} else if (const auto& param = ov::as_type_ptr<ov::op::v0::Parameter>(n)) {
regs.push_back(f->get_parameter_index(param));
} else if (const auto& store = ov::as_type_ptr<ngraph::snippets::op::Store>(n)) {
regs.push_back(f->get_result_index(store) + num_parameters);
} else {
for (const auto& output : n->outputs()) {
auto allocated = physical_regs[output.get_tensor_ptr()];
regs.push_back(allocated);
}
}
std::vector<size_t> regs; regs.reserve(n->outputs().size());
for (auto output : n->outputs()) {
auto allocated = physical_regs[output.get_tensor_ptr()];
regs.push_back(allocated);
}
rt["reginfo"] = regs;
}

View File

@ -99,15 +99,17 @@ auto is_layout_oblivious(const std::shared_ptr<const Node> &n) -> bool {
|| ov::is_type<opset1::Tanh>(n)
|| ov::is_type<ngraph::op::v0::Gelu>(n)
|| ov::is_type<ngraph::op::v7::Gelu>(n)
|| ov::is_type<ngraph::op::v4::HSwish>(n);
|| ov::is_type<ngraph::op::v4::HSwish>(n)
|| ov::is_type<ngraph::op::v0::Convert>(n);
};
return is_layout_oblivious_unary(n) || is_layout_oblivious_binary(n);
}
auto has_supported_in_out(const std::shared_ptr<const Node> &n) -> bool {
auto supported = [](descriptor::Tensor& t) -> bool {
return t.get_element_type() == ngraph::element::f32 &&
t.get_partial_shape().is_static();
static const std::set<ngraph::element::Type> supported_data_types =
{ ngraph::element::f32, ngraph::element::i32, ngraph::element::bf16, ngraph::element::i8, ngraph::element::u8 };
return t.get_partial_shape().is_static() && supported_data_types.count(t.get_element_type()) != 0;
};
const auto & inputs = n->inputs();
const auto & outputs = n->outputs();
@ -148,19 +150,9 @@ auto update_out_tensor_name(std::shared_ptr<ngraph::snippets::op::Subgraph> &sub
for (unsigned int i = 0; i < subgraph->get_output_size() && not_set; i++) {
for (const auto &in : subgraph->get_output_target_inputs(i)) {
if (ov::is_type<opset1::Result>(in.get_node())) {
auto out_tensor = subgraph->output(i).get_tensor_ptr();
NGRAPH_SUPPRESS_DEPRECATED_START
if (out_tensor->get_name().empty()) {
const auto& body_result = subgraph->get_body()->get_output_op(i);
const auto& body_result_input = body_result->get_input_source_output(0);
// Note that create_ie_output_name() checks only deprecated output.get_tensor().get_name()
// However output.get_tensor().get_names() should also be updated
if (!body_result_input.get_names().empty())
out_tensor->add_names(body_result_input.get_names());
std::string newTensorName = ngraph::op::util::get_ie_output_name(body_result_input);
out_tensor->set_name(newTensorName);
}
NGRAPH_SUPPRESS_DEPRECATED_END
const auto& body_result = subgraph->get_body()->get_output_op(i);
const auto& body_result_input = body_result->get_input_source_output(0);
op::Subgraph::fill_empty_output_names(subgraph->output(i), body_result_input);
not_set = false;
break;
}
@ -406,6 +398,40 @@ TokenizeSnippets::TokenizeSnippets() {
auto& input_body = clones[input_node];
size_t source_output_index = input_value.get_index();
auto source_result = input_body->get_results()[source_output_index];
// We cannot add a new node that is not a Convert after a Convert (that is the start node), to avoid arithmetic problems with the conversion.
// We can add any new node in the Subgraph after a Convert that directly follows an Input:
// Parameter
// |
// Convert
//
// We cannot add a new node that isn't a Convert in the Subgraph after an existing Convert:
// Parameter
// Relu
// Convert
//
// But we can add a new Convert in the Subgraph after an existing Convert:
// Parameter
// Relu
// Convert
// Convert
//
// Thus, we can grow the subgraph only if a Convert is its first node; we have to abort if a Convert is the last node and we want to add a non-Convert op.
// We have this limitation because at the moment we support only one execution precision inside the body, so
// if there is a Convert whose input and output data types aren't equal to the supported exec type,
// we can get conversion math errors
const auto output_of_subgraph = source_result->get_input_node_shared_ptr(0);
if (!ov::is_type<ngraph::op::v0::Convert>(node) && ov::is_type<ngraph::op::v0::Convert>(output_of_subgraph)) {
// Also we can add new node after < Parameter -> Convert -> Convert -> Convert >
auto grandparent = output_of_subgraph->get_input_node_ptr(0);
while (ov::is_type<ngraph::op::v0::Convert>(grandparent)) {
grandparent = grandparent->get_input_node_ptr(0);
}
if (!ov::is_type<ngraph::op::v0::Parameter>(grandparent)) {
return abort_with_strategy("Convert is supported only as an Input or a Result of the subgraph. Aborting");
}
}
// Result op has a single input
internal_inputs.push_back(source_result->input_value(0));
} else {
@ -477,7 +503,7 @@ TokenizeSnippets::TokenizeSnippets() {
throw ngraph_error("body results and node results size mismatch during subgraph collaps");
}
// todo: move this plugin-specific constraint to the plugin callback
if (body_parameters.size() + body_results.size() > 7) {
if (body_parameters.size() + body_results.size() > 12) {
const std::string message_reset = "new subgraph is created. Impossible to schedule subgraph with " +
std::to_string(body_parameters.size()) + " inputs and " + std::to_string(body_results.size()) + " outputs.";
const std::string message_abort = "failed to continue subgraph. Impossible to schedule subgraph with " +

View File

@ -0,0 +1,72 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <snippets/itt.hpp>
#include "snippets/remarks.hpp"
#include "snippets/pass/insert_convert_on_inputs.hpp"
#include "snippets/snippets_isa.hpp"
#include "ngraph/type.hpp"
#include "ngraph/node.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/pattern/op/or.hpp>
// We should recursively (after full sequences of ConvertTruncation) go through the inputs and
// insert ConvertSaturation with the supported element type before eltwises
// NOTE: JUST EXAMPLE:
// Parameter I8
// ConvertTruncation U8
// / | \
// ConvertTruncation F32 ConvertTruncation I32 ConvertTruncation BF16
// Eltwise ConvertSaturation FP32 ConvertTruncation I32
// <> Eltwise ConvertSaturation FP32
// <> Eltwise
bool insertConvertSaturationAfterNode(const std::shared_ptr<ov::Node>& node, const ov::element::Type element_type) {
bool rewritten = false;
for (const auto& output : node->outputs()) {
for (auto consumer : output.get_target_inputs()) {
const auto output_shared_node = consumer.get_node()->shared_from_this();
// Go down through ConvertTruncation sequence
if (auto existing_convert_t = ov::as_type_ptr<ngraph::snippets::op::ConvertTruncation>(output_shared_node)) {
rewritten = insertConvertSaturationAfterNode(existing_convert_t, element_type);
continue;
}
// Check whether a ConvertSaturation with the supported element type already exists; if not, insert one
auto existing_convert_s = ov::as_type_ptr<ngraph::snippets::op::ConvertSaturation>(output_shared_node);
if ((!existing_convert_s && !ov::is_type<ov::op::v0::Result>(output_shared_node) && consumer.get_element_type() != element_type) ||
(existing_convert_s && existing_convert_s->get_destination_type() != element_type)) {
const auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(node, element_type);
consumer.replace_source_output(convert);
rewritten |= true;
}
}
}
return rewritten;
}
ngraph::snippets::pass::InsertConvertOnInputs::InsertConvertOnInputs(const ov::element::Type exec_type) {
MATCHER_SCOPE(InsertConvertOnInputs);
auto param_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Parameter>();
auto scalar_pattern = pattern::wrap_type<opset1::Constant>(
[=](Output<Node> output) -> bool { return ngraph::shape_size(output.get_shape()) == 1; });
auto input = std::make_shared<pattern::op::Or>(OutputVector{ param_pattern, scalar_pattern });
ngraph::matcher_pass_callback callback = [this, exec_type](ngraph::pattern::Matcher& m) {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertConvertOnInputs")
auto root = m.get_match_root();
auto rewritten = insertConvertSaturationAfterNode(root, exec_type);
return rewritten;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(input, matcher_name);
register_matcher(m, callback);
}

View File

@ -12,11 +12,11 @@
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
ngraph::snippets::pass::InsertLoad::InsertLoad() {
ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) {
MATCHER_SCOPE(InsertLoad);
register_matcher(std::make_shared<ngraph::pattern::Matcher>(
ngraph::pattern::wrap_type<ngraph::opset1::Parameter>(), matcher_name),
[this](ngraph::pattern::Matcher &m) {
[this, count](ngraph::pattern::Matcher &m) {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertLoad")
auto root = m.get_match_root();
@ -29,7 +29,7 @@ ngraph::snippets::pass::InsertLoad::InsertLoad() {
}
}
auto load = std::make_shared<ngraph::snippets::op::Load> (root);
auto load = std::make_shared<ngraph::snippets::op::Load>(root, count);
ngraph::copy_runtime_info(root, load);
bool rewritten = false;
@ -46,11 +46,11 @@ ngraph::snippets::pass::InsertLoad::InsertLoad() {
});
}
ngraph::snippets::pass::InsertStore::InsertStore() {
ngraph::snippets::pass::InsertStore::InsertStore(const size_t count) {
MATCHER_SCOPE(InsertStore);
register_matcher(std::make_shared<ngraph::pattern::Matcher>(
ngraph::pattern::wrap_type<ngraph::opset1::Result>(), matcher_name),
[this](ngraph::pattern::Matcher &m) {
[this, count](ngraph::pattern::Matcher &m) {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertStore")
auto root = m.get_match_root();
@ -61,7 +61,7 @@ ngraph::snippets::pass::InsertStore::InsertStore() {
}
}
auto store = std::make_shared<ngraph::snippets::op::Store> (root->input_value(0));
auto store = std::make_shared<ngraph::snippets::op::Store> (root->input_value(0), count);
ngraph::copy_runtime_info(root, store);
root->set_argument(0, store);
return true;

View File

@ -15,7 +15,7 @@
ngraph::snippets::pass::LoadMoveBroadcastToBroadcastLoad::LoadMoveBroadcastToBroadcastLoad() {
MATCHER_SCOPE(LoadMoveBroadcastToBroadcastLoad);
auto param_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Parameter>();
auto load_pattern = std::make_shared<ngraph::snippets::op::Load>(param_pattern);
auto load_pattern = ngraph::pattern::wrap_type<ngraph::snippets::op::Load>({param_pattern});
auto fbn = std::make_shared<ngraph::snippets::op::BroadcastMove>(load_pattern, Shape{1});
register_matcher(std::make_shared<ngraph::pattern::Matcher>(fbn, matcher_name),

View File

@ -0,0 +1,31 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <snippets/itt.hpp>
#include "snippets/op/convert_saturation.hpp"
#include "snippets/pass/reset_type_relaxed_node_precision.hpp"
#include "ngraph_ops/type_relaxed.hpp"
#include <ngraph/rt_info.hpp>
ngraph::snippets::pass::ResetTypeRelaxedNodePrecision::ResetTypeRelaxedNodePrecision(const ov::element::Type exec_type) : exec_type(exec_type) { }
bool ngraph::snippets::pass::ResetTypeRelaxedNodePrecision::run_on_model(const std::shared_ptr<ov::Model> &m) {
RUN_ON_FUNCTION_SCOPE(ResetTypeRelaxedNodePrecision);
bool rewritten = false;
for (auto& op : m->get_ordered_ops()) {
if (auto node = std::dynamic_pointer_cast<ngraph::op::TypeRelaxedBase>(op)) {
for (int i = 0; i < op->outputs().size(); i++) {
node->set_overridden_output_type(exec_type, i);
rewritten |= true;
}
} else {
op->validate_and_infer_types();
}
}
return rewritten;
}

View File

@ -0,0 +1,34 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/remarks.hpp"
#include <snippets/itt.hpp>
#include "snippets/pass/transform_convert_to_truncation.hpp"
#include "snippets/snippets_isa.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
ngraph::snippets::pass::TransformConvertToConvertTruncation::TransformConvertToConvertTruncation() {
MATCHER_SCOPE(TransformConvertToConvertTruncation);
register_matcher(std::make_shared<ngraph::pattern::Matcher>(
ngraph::pattern::wrap_type<ngraph::opset1::Convert>()),
[this](ngraph::pattern::Matcher &m) {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::TransformConvertToConvertTruncation")
const auto root = m.get_match_root();
const auto convert = ngraph::as_type_ptr<ngraph::opset1::Convert>(root);
if (!convert)
return false;
auto convert_truncation = std::make_shared<op::ConvertTruncation>(convert->get_input_source_output(0),
convert->get_destination_type());
convert_truncation->set_friendly_name(convert->get_friendly_name());
ngraph::copy_runtime_info(convert, convert_truncation);
ngraph::replace_node(convert, convert_truncation);
return true;
});
}
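
For context, this pass plugs into a standard ngraph pass pipeline. A minimal usage sketch (not from the commit; it mirrors the pass::Manager pattern used in the tests below, and `f` is a hypothetical, already-built function):
#include <ngraph/pass/manager.hpp>
#include <transformations/init_node_info.hpp>
#include "snippets/pass/transform_convert_to_truncation.hpp"
// `f` is assumed to contain opset1::Convert ops.
void decompose_converts(const std::shared_ptr<ngraph::Function>& f) {
    ngraph::pass::Manager m;
    m.register_pass<ngraph::pass::InitNodeInfo>();
    m.register_pass<ngraph::snippets::pass::TransformConvertToConvertTruncation>();
    m.run_passes(f); // every Convert is replaced by an equivalent ConvertTruncation
}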

View File

@ -7,40 +7,43 @@
#include "snippets/pass/vector_to_scalar.hpp"
#include "snippets/snippets_isa.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
ngraph::snippets::pass::ReplaceLoadsWithScalarLoads::ReplaceLoadsWithScalarLoads() {
MATCHER_SCOPE(ReplaceLoadsWithScalarLoads);
ngraph::snippets::pass::SetScalarCountForLoad::SetScalarCountForLoad() {
MATCHER_SCOPE(SetScalarCountForLoad);
register_matcher(std::make_shared<ngraph::pattern::Matcher>(
ngraph::pattern::wrap_type<ngraph::snippets::op::Load>(), matcher_name),
[this](ngraph::pattern::Matcher &m) {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ReplaceLoadsWithScalarLoads_callback")
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SetScalarCountForLoad_callback")
auto root = m.get_match_root();
if (transformation_callback(root))
return false;
auto load = std::make_shared<ngraph::snippets::op::ScalarLoad> (root->input_value(0));
load->set_friendly_name(root->get_friendly_name());
ngraph::copy_runtime_info(root, load);
ngraph::replace_node(root, load);
const auto load = ov::as_type_ptr<ngraph::snippets::op::Load>(root);
if (!load)
return false;
load->set_count(1lu);
return true;
});
}
ngraph::snippets::pass::ReplaceStoresWithScalarStores::ReplaceStoresWithScalarStores() {
MATCHER_SCOPE(ReplaceStoresWithScalarStores);
ngraph::snippets::pass::SetScalarCountForStore::SetScalarCountForStore() {
MATCHER_SCOPE(SetScalarCountForStore);
register_matcher(std::make_shared<ngraph::pattern::Matcher>(
ngraph::pattern::wrap_type<ngraph::snippets::op::Store>(), matcher_name),
[this](ngraph::pattern::Matcher &m) {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ReplaceStoresWithScalarStores_callback")
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SetScalarCountForStore_callback")
auto root = m.get_match_root();
if (transformation_callback(root))
return false;
auto store = std::make_shared<ngraph::snippets::op::ScalarStore> (root->input_value(0));
store->set_friendly_name(root->get_friendly_name());
ngraph::copy_runtime_info(root, store);
ngraph::replace_node(root, store);
const auto store = ov::as_type_ptr<ngraph::snippets::op::Store>(root);
if (!store)
return false;
store->set_count(1lu);
return true;
});
}

View File

@ -29,7 +29,7 @@ public:
DummyTargetMachine();
bool is_supported() const override { return true; }
ngraph::snippets::code get_snippet() const override { return nullptr; }
size_t get_lanes() const override { return 1; }
size_t get_lanes() const override { return 10; }
};
class DummyGenerator : public ngraph::snippets::Generator {

View File

@ -0,0 +1,40 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "lowering_utils.hpp"
#include "snippets_helpers.hpp"
/* The main purpose is to test that:
* - Load/Store ops are inserted
* - Load + BroadcastMove fuses to BroadcastLoad (not the main focus, but still has to be covered; overlaps with insert_movebroadcast.cpp)
* - Proper Load/Stores are converted to scalar form to avoid invalid memory access by vector tile
* (temporarily disabled, since the corresponding PR is not merged yet)
*/
namespace ov {
namespace test {
namespace snippets {
typedef std::tuple<
Shape, // Input shape 0
Shape, // Input shape 1
Shape, // Input shape 2
Shape, // Broadcast shape 0
Shape, // Broadcast shape 1
Shape // Broadcast shape 2
> insertLoadStoreParams;
class InsertLoadStoreTests : public LoweringTests, public testing::WithParamInterface<insertLoadStoreParams> {
public:
static std::string getTestCaseName(testing::TestParamInfo<insertLoadStoreParams> obj);
protected:
void SetUp() override;
std::shared_ptr<SnippetsFunctionBase> snippets_function;
};
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -23,18 +23,15 @@ DummyTargetMachine::DummyTargetMachine() {
jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::Load::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::VectorLoad::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::ScalarLoad::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::Store::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::VectorStore::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::ScalarStore::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::Scalar::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::Tile::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = dummy_functor;
}
std::shared_ptr<ngraph::snippets::op::Subgraph> LoweringTests::getSubgraph(const std::shared_ptr<Model>& f) {

View File

@ -49,6 +49,7 @@ TEST_P(CanonicalizationTests, Add) {
function = snippets_function->getOriginal();
function_ref = snippets_function->getReference();
auto subgraph = getTokenizedSubgraph(function);
subgraph->set_generator(std::make_shared<DummyGenerator>());
Shape canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes);
ASSERT_DIMS_EQ(canonical_output_shape, expected_output_shape);
}

View File

@ -5,6 +5,7 @@
#include <gtest/gtest.h>
#include <pass/collapse_subgraph.hpp>
#include <subgraph_simple.hpp>
#include <subgraph_converts.hpp>
#include "snippets/pass/collapse_subgraph.hpp"
namespace ov {
@ -39,6 +40,43 @@ TEST_F(CollapseSubgraphTests, smoke_Snippets_AvoidLoopEltwise) {
run();
}
TEST_F(CollapseSubgraphTests, smoke_Snippets_OneConvert) {
const auto &f = ConvertFunction(std::vector<Shape>{{2, 5}});
function = f.getOriginal();
function_ref = f.getReference();
run();
}
TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertInput) {
const auto &f = ConvertInputFunction(std::vector<Shape>{{2, 5}, {1, 5}});
function = f.getOriginal();
function_ref = f.getReference();
run();
}
TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertOutput) {
const auto &f = ConvertOutputFunction(std::vector<Shape>{{2, 5}, {1, 5}});
function = f.getOriginal();
function_ref = f.getReference();
run();
}
TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertStub) {
const auto &f = ConvertStubFunction(std::vector<Shape>{{2, 5, 2}, {1, 5, 1}});
function = f.getOriginal();
function_ref = f.getReference();
run();
}
TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertPartialInputsAndResults) {
const auto &f = ConvertPartialInputsAndResultsFunction(std::vector<Shape>{{2, 5, 1}, {1, 5, 1}, {2, 1, 10}},
std::vector<ov::element::Type>{ov::element::i8, ov::element::bf16, ov::element::f32},
std::vector<ov::element::Type>{ov::element::f32, ov::element::i8});
function = f.getOriginal();
function_ref = f.getReference();
run();
}
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -19,56 +19,81 @@ using namespace ngraph;
// todo: Rewrite this test using Snippets test infrastructure. See ./include/canonicalization.hpp for example
TEST(TransformationTests, ReplaceLoadsWithScalarLoads) {
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
{
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
auto load = std::make_shared<snippets::isa::Load>(data);
auto neg = std::make_shared<opset1::Negative>(load);
auto store = std::make_shared<snippets::isa::Store>(neg);
f = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
pass::Manager m;
m.register_pass<pass::InitNodeInfo>();
m.register_pass<snippets::pass::ReplaceLoadsWithScalarLoads>();
m.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
template<typename T>
size_t get_count(const std::shared_ptr<Function>& f, const std::string& name) {
size_t load_count = std::numeric_limits<size_t>::max();
for (auto op : f->get_ops()) {
if (op->get_friendly_name() == name) {
load_count = ov::as_type_ptr<T>(op)->get_count();
}
}
{
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
auto load = std::make_shared<snippets::isa::ScalarLoad>(data);
auto neg = std::make_shared<opset1::Negative>(load);
auto store = std::make_shared<snippets::isa::Store>(neg);
f_ref = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
return load_count;
}
TEST(TransformationTests, ReplaceStoresWithScalarStores) {
TEST(TransformationTests, SetScalarCountForLoad) {
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
const auto count = 16;
{
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
auto load = std::make_shared<snippets::isa::Load>(data);
auto load = std::make_shared<snippets::isa::Load>(data, count);
load->set_friendly_name("load");
auto neg = std::make_shared<opset1::Negative>(load);
auto store = std::make_shared<snippets::isa::Store>(neg);
auto store = std::make_shared<snippets::isa::Store>(neg, count);
f = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
pass::Manager m;
m.register_pass<pass::InitNodeInfo>();
m.register_pass<snippets::pass::ReplaceStoresWithScalarStores>();
m.register_pass<snippets::pass::SetScalarCountForLoad>();
m.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
auto load = std::make_shared<snippets::isa::Load>(data);
auto load = std::make_shared<snippets::isa::Load>(data, 1lu);
load->set_friendly_name("load_ref");
auto neg = std::make_shared<opset1::Negative>(load);
auto store = std::make_shared<snippets::isa::ScalarStore>(neg);
auto store = std::make_shared<snippets::isa::Store>(neg, count);
f_ref = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
auto load_count = get_count<ngraph::snippets::op::Load>(f, "load");
auto load_count_ref = get_count<ngraph::snippets::op::Load>(f_ref, "load_ref");
ASSERT_EQ(load_count, load_count_ref);
}
TEST(TransformationTests, SetScalarCountForStore) {
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
const auto count = 16;
{
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
auto load = std::make_shared<snippets::isa::Load>(data, count);
auto neg = std::make_shared<opset1::Negative>(load);
auto store = std::make_shared<snippets::isa::Store>(neg, count);
store->set_friendly_name("store");
f = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
pass::Manager m;
m.register_pass<pass::InitNodeInfo>();
m.register_pass<snippets::pass::SetScalarCountForStore>();
m.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
auto load = std::make_shared<snippets::isa::Load>(data, count);
auto neg = std::make_shared<opset1::Negative>(load);
auto store = std::make_shared<snippets::isa::Store>(neg, 1lu);
store->set_friendly_name("store_ref");
f_ref = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
int64_t store_count = get_count<ngraph::snippets::op::Store>(f, "store");
int64_t store_count_ref = get_count<ngraph::snippets::op::Store>(f_ref, "store_ref");
ASSERT_EQ(store_count, store_count_ref);
}

View File

@ -25,12 +25,14 @@ TEST(TransformationTests, AssignRegisters) {
{
auto p0 = std::make_shared<opset1::Parameter>(element::f32, Shape(1));
auto p1 = std::make_shared<opset1::Parameter>(element::f32, Shape(1));
p0->set_friendly_name("p00");
p1->set_friendly_name("p01");
auto y00 = std::make_shared<snippets::isa::Load>(p0); y00->set_friendly_name("y00");
auto y01 = std::make_shared<snippets::isa::Load>(p1); y01->set_friendly_name("y01");
auto y02 = std::make_shared<opset1::Multiply>(y00, y01); y02->set_friendly_name("y02");
auto y03 = std::make_shared<snippets::isa::Store>(y02); y03->set_friendly_name("y03");
f = std::make_shared<Function>(NodeVector{y03}, ParameterVector{p0, p1});
auto s00 = std::make_shared<snippets::isa::Store>(y02);
s00->set_friendly_name("s00");
f = std::make_shared<Function>(NodeVector{s00}, ParameterVector{p0, p1});
pass::Manager m;
m.register_pass<pass::InitNodeInfo>();
@ -39,13 +41,17 @@ TEST(TransformationTests, AssignRegisters) {
ASSERT_NO_THROW(check_rt_info(f));
}
// instead of comparing to a reference function check that registers are correctly assigned
// and stored to runtime info
/* Instead of comparing to a reference function check that registers are correctly assigned and stored to runtime
* info. Note that Parameters and Store rt_info contains gpr indexes, while general op's rt_info contain vector
* indexes */
{
std::map<std::string, size_t> ref_registers {
{"p00", 0}, // gpr
{"p01", 1}, // gpr
{"y00", 0},
{"y01", 1},
{"y02", 2}
{"y02", 2},
{"s00", 2}, // gpr
};
auto total_ops = 0;
@ -75,6 +81,14 @@ TEST(TransformationTests, AssignRegisters2) {
auto p5 = std::make_shared<opset1::Parameter>(ngraph::element::f32, Shape());
auto p6 = std::make_shared<opset1::Parameter>(ngraph::element::f32, Shape());
auto p7 = std::make_shared<opset1::Parameter>(ngraph::element::f32, Shape());
p0->set_friendly_name("p00");
p1->set_friendly_name("p01");
p2->set_friendly_name("p02");
p3->set_friendly_name("p03");
p4->set_friendly_name("p04");
p5->set_friendly_name("p05");
p6->set_friendly_name("p06");
p7->set_friendly_name("p07");
auto c0 = std::make_shared<snippets::isa::Scalar>(ngraph::element::f32, Shape(), 3.14f); c0->set_friendly_name("r00");
auto c1 = std::make_shared<snippets::isa::Scalar>(ngraph::element::f32, Shape(), 6.6260701e-34f); c1->set_friendly_name("r01");
@ -102,9 +116,10 @@ TEST(TransformationTests, AssignRegisters2) {
auto y20 = std::make_shared<opset1::Add>(y17, y18); y20->set_friendly_name("r22");
auto y21 = std::make_shared<opset1::Add>(y15, y19); y21->set_friendly_name("r23");
auto y22 = std::make_shared<opset1::Add>(y20, y21); y22->set_friendly_name("r24");
auto y23 = std::make_shared<snippets::isa::Store>(y22);
auto s00 = std::make_shared<snippets::isa::Store>(y22);
s00->set_friendly_name("s00");
f = std::make_shared<Function>(NodeVector{y23}, ParameterVector{p0, p1, p2, p3, p4, p5, p6, p7});
f = std::make_shared<Function>(NodeVector{s00}, ParameterVector{p0, p1, p2, p3, p4, p5, p6, p7});
pass::Manager m;
m.register_pass<pass::InitNodeInfo>();
@ -117,10 +132,14 @@ TEST(TransformationTests, AssignRegisters2) {
// and stored to runtime info
{
std::map<std::string, size_t> ref_registers {
{"r00", 1}, {"r01", 3}, {"r02", 5}, {"r03", 5}, {"r04", 2}, {"r05", 6}, {"r06", 6}, {"r07", 6},
{"r08", 5}, {"r09", 2}, {"r10", 1}, {"r11", 4}, {"r12", 4}, {"r13", 6}, {"r14", 2}, {"r15", 5},
{"r16", 0}, {"r17", 4}, {"r18", 0}, {"r19", 2}, {"r20", 4}, {"r21", 1}, {"r22", 0}, {"r23", 6},
{"r24", 1}
{"p00", 0}, {"p01", 1}, {"p02", 2}, {"p03", 3}, {"p04", 4}, {"p05", 5},
{"p06", 6}, {"p07", 7},
{"r00", 1}, {"r01", 3}, {"r02", 5}, {"r03", 5}, {"r04", 2}, {"r05", 6},
{"r06", 6}, {"r07", 6}, {"r08", 5}, {"r09", 2}, {"r10", 1}, {"r11", 4},
{"r12", 4}, {"r13", 6}, {"r14", 2}, {"r15", 5}, {"r16", 0}, {"r17", 4},
{"r18", 0}, {"r19", 2}, {"r20", 4}, {"r21", 1}, {"r22", 0}, {"r23", 6},
{"r24", 1},
{"s00", 8},
};
auto total_ops = 0;

View File

@ -13,6 +13,10 @@
#include "jit_eltwise_emitters.hpp"
#include "jit_dnnl_emitters.hpp"
#include "jit_dnnl_ext_emitters.hpp"
#include "jit_conversion_emitters.hpp"
#include "snippets_transformations/op/load_convert.hpp"
#include "snippets_transformations/op/store_convert.hpp"
#include <ngraph/opsets/opset5.hpp>
@ -39,25 +43,25 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_
: TargetMachine(), h(new jit_snippet()), isa(host_isa) {
// data movement
jitters[ngraph::opset1::Parameter::get_type_info_static()] = CREATE_EMITTER(NopEmitter);
jitters[ngraph::snippets::op::BlockedParameter::get_type_info_static()] = CREATE_EMITTER(NopEmitter);
jitters[ngraph::opset1::Result::get_type_info_static()] = CREATE_EMITTER(NopEmitter);
// jitters[ngraph::opset1::Constant::get_type_info_static()] = CREATE_EMITTER(); // Not supported
jitters[ngraph::snippets::op::Load::get_type_info_static()] = CREATE_EMITTER(LoadEmitter);
jitters[ngraph::snippets::op::VectorLoad::get_type_info_static()] = CREATE_EMITTER(LoadEmitter);
jitters[ngraph::snippets::op::ScalarLoad::get_type_info_static()] = CREATE_EMITTER(ScalarLoadEmitter);
jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = CREATE_EMITTER(BroadcastLoadEmitter);
jitters[ov::intel_cpu::LoadConvertSaturation::get_type_info_static()] = CREATE_EMITTER(LoadConvertEmitter);
jitters[ov::intel_cpu::LoadConvertTruncation::get_type_info_static()] = CREATE_EMITTER(LoadConvertEmitter);
jitters[ngraph::snippets::op::Store::get_type_info_static()] = CREATE_EMITTER(StoreEmitter);
jitters[ngraph::snippets::op::VectorStore::get_type_info_static()] = CREATE_EMITTER(StoreEmitter);
jitters[ngraph::snippets::op::ScalarStore::get_type_info_static()] = CREATE_EMITTER(ScalarStoreEmitter);
jitters[ov::intel_cpu::StoreConvertSaturation::get_type_info_static()] = CREATE_EMITTER(StoreConvertEmitter);
jitters[ov::intel_cpu::StoreConvertTruncation::get_type_info_static()] = CREATE_EMITTER(StoreConvertEmitter);
jitters[ngraph::snippets::op::Scalar::get_type_info_static()] = CREATE_EMITTER(ScalarEmitter);
jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = CREATE_EMITTER(FakeBroadcastEmitter);
jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = CREATE_EMITTER(BroadcastMoveEmitter);
// jitters[ngraph::snippets::op::Nop::get_type_info_static()] = CREATE_EMITTER(NopEmitter); // Not supported
// jitters[ngraph::opset1::Broadcast::get_type_info_static()] = CREATE_EMITTER(); // Not supported
// jitters[ngraph::opset1::Convert::get_type_info_static()] = CREATE_EMITTER(); // Not supported
jitters[ngraph::snippets::op::ConvertTruncation::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_convert_truncation_emitter);
jitters[ngraph::snippets::op::ConvertSaturation::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_convert_saturation_emitter);
// jitters[ngraph::opset1::FakeQuantize::get_type_info_static()] = CREATE_EMITTER(); // not supported
// binary
@ -118,6 +122,7 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_
jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = CREATE_EMITTER(KernelEmitter);
jitters[ngraph::snippets::op::Tile::get_type_info_static()] = CREATE_EMITTER(TileEmitter);
jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = CREATE_EMITTER(TileSchedulerEmitter);
}
size_t ov::intel_cpu::CPUTargetMachine::get_lanes() const {

View File

@ -0,0 +1,313 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "jit_conversion_emitters.hpp"
#include "utils/bfloat16.hpp"
#include <cpu/x64/jit_uni_eltwise.hpp>
#include <ngraph/opsets/opset1.hpp>
#include <nodes/eltwise.h>
using namespace InferenceEngine;
using namespace dnnl::impl::utils;
using namespace dnnl::impl;
using namespace dnnl::impl::cpu::x64;
using namespace Xbyak;
namespace ov {
namespace intel_cpu {
jit_convert_emitter::jit_convert_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr<ngraph::Node>& node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
input_type = node->get_input_element_type(0);
output_type = node->get_output_element_type(0);
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(host, host_isa));
}
void jit_convert_emitter::validate_types() const {
auto is_supported_type = [this](const ov::element::Type& type) {
return any_of(supported_types.begin(), supported_types.end(),
[&type](const ov::element::Type& supported_type) { return supported_type == type; } );
};
if (!is_supported_type(input_type))
IE_THROW() << "Unsupported input type: " << input_type.get_type_name();
if (!is_supported_type(output_type))
IE_THROW() << "Unsupported output type: " << output_type.get_type_name();
}
size_t jit_convert_emitter::get_inputs_num() const { return 1; }
void jit_convert_emitter::emit_data() const {
jit_emitter::emit_data();
if (emu_vcvtneps2bf16)
emu_vcvtneps2bf16->emit_data();
}
void jit_convert_emitter::float2bfloat(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
Zmm zmm_src = Zmm(in_vec_idxs[0]);
Zmm zmm_dst = Zmm(out_vec_idxs[0]);
if (mayiuse(avx512_core_bf16)) {
h->vcvtneps2bf16(zmm_dst, zmm_src);
} else {
if (!emu_vcvtneps2bf16)
IE_THROW() << "Converter from float to bf16 isn't initialized!";
emu_vcvtneps2bf16->emit_code({static_cast<size_t>(zmm_src.getIdx())}, {static_cast<size_t>(zmm_dst.getIdx())});
}
}
jit_convert_truncation_emitter::jit_convert_truncation_emitter(jit_generator *host, cpu_isa_t host_isa,
const std::shared_ptr<ngraph::Node>& node, Precision exec_prc)
: jit_convert_emitter(host, host_isa, node, exec_prc) {
prepare_table();
}
bool jit_convert_truncation_emitter::is_i8_and_u8_case() const {
return one_of(input_type, ov::element::i8, ov::element::u8) &&
one_of(output_type, ov::element::i8, ov::element::u8);
}
void jit_convert_truncation_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs,
const emitter_context *emit_context) const {
validate_types();
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_core) {
emit_isa<cpu::x64::avx512_core>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void jit_convert_truncation_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src = Vmm(in_vec_idxs[0]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
// For Truncation behavior we can just move data from src to dst if we want to convert i8 -> u8 or u8 -> i8
if ((input_type == output_type) || is_i8_and_u8_case()) {
if (vmm_src != vmm_dst) {
h->uni_vmovups(vmm_dst, vmm_src);
}
return;
}
switch (input_type) {
case ov::element::f32:
if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8))
h->uni_vcvttps2dq(vmm_dst, vmm_src);
break;
case ov::element::i32:
if (one_of(output_type, ov::element::f32, ov::element::bf16))
h->uni_vcvtdq2ps(vmm_dst, vmm_src);
break;
case ov::element::bf16:
h->vpmovzxwd(vmm_dst, vmm_src);
h->uni_vpslld(vmm_dst, vmm_dst, 16);
if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8))
h->uni_vcvttps2dq(vmm_dst, vmm_dst);
break;
case ov::element::i8:
h->uni_vpmovsxbd(vmm_dst, vmm_src);
break;
case ov::element::u8:
h->uni_vpmovzxbd(vmm_dst, vmm_src);
break;
default:
assert(!"unsupported output data type");
}
switch (output_type) {
case ov::element::f32:
if (!one_of(input_type, ov::element::i32, ov::element::bf16)) {
h->uni_vcvtdq2ps(vmm_dst, vmm_dst);
}
break;
case ov::element::i32:
break;
case ov::element::bf16:
if (input_type == ov::element::f32) {
float2bfloat({static_cast<size_t>(vmm_src.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
} else {
if (one_of(input_type, ov::element::i8, ov::element::u8)) {
h->uni_vcvtdq2ps(vmm_dst, vmm_dst);
}
float2bfloat({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
}
break;
case ov::element::i8:
case ov::element::u8:
if (input_type == ov::element::i32) {
dword2int8<isa>({static_cast<size_t>(vmm_src.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
} else {
dword2int8<isa>({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
}
break;
default:
assert(!"unsupported output data type");
}
}
void jit_convert_truncation_emitter::register_table_entries() {
if (host_isa_ == dnnl::impl::cpu::x64::avx2 &&
one_of(output_type, ov::element::i8, ov::element::u8) &&
!is_i8_and_u8_case())
push_arg_entry_of("mask_byte", 0x000000ff, true);
}
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void jit_convert_truncation_emitter::dword2int8(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src = Vmm(in_vec_idxs[0]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Xmm xmm_dst = Xmm(out_vec_idxs[0]);
Ymm ymm_dst = Ymm(out_vec_idxs[0]);
if (isa == dnnl::impl::cpu::x64::avx512_core) {
h->vpmovdb(xmm_dst, vmm_src);
} else if (isa == dnnl::impl::cpu::x64::avx2) {
h->vpand(vmm_dst, vmm_src, table_val("mask_byte")); // to avoid saturation
h->uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != dnnl::impl::cpu::x64::sse41)
h->vpermq(ymm_dst, ymm_dst, 0x08);
h->uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
}
}
jit_convert_saturation_emitter::jit_convert_saturation_emitter(jit_generator *host, cpu_isa_t host_isa,
const std::shared_ptr<ngraph::Node>& node, Precision exec_prc)
: jit_convert_emitter(host, host_isa, node, exec_prc) {
}
void jit_convert_saturation_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs,
const emitter_context *emit_context) const {
validate_types();
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_core) {
emit_isa<cpu::x64::avx512_core>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void jit_convert_saturation_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src = Vmm(in_vec_idxs[0]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
if (input_type == output_type) {
h->uni_vmovups(vmm_dst, vmm_src);
return;
}
switch (input_type) {
case ov::element::f32:
if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8))
h->uni_vcvtps2dq(vmm_dst, vmm_src);
break;
case ov::element::i32:
if (one_of(output_type, ov::element::f32, ov::element::bf16))
h->uni_vcvtdq2ps(vmm_dst, vmm_src);
break;
case ov::element::bf16:
h->vpmovzxwd(vmm_dst, vmm_src);
h->uni_vpslld(vmm_dst, vmm_dst, 16);
if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8))
h->uni_vcvttps2dq(vmm_dst, vmm_dst);
break;
case ov::element::i8:
h->uni_vpmovsxbd(vmm_dst, vmm_src);
break;
case ov::element::u8:
h->uni_vpmovzxbd(vmm_dst, vmm_src);
break;
default:
assert(!"unsupported output data type");
}
switch (output_type) {
case ov::element::f32:
if (!one_of(input_type, ov::element::i32, ov::element::bf16)) {
h->uni_vcvtdq2ps(vmm_dst, vmm_dst);
}
break;
case ov::element::i32:
break;
case ov::element::bf16:
if (input_type == ov::element::f32) {
float2bfloat({static_cast<size_t>(vmm_src.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
} else {
if (one_of(input_type, ov::element::i8, ov::element::u8)) {
h->uni_vcvtdq2ps(vmm_dst, vmm_dst);
}
float2bfloat({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
}
break;
case ov::element::i8:
case ov::element::u8:
if (input_type == ov::element::i32) {
dword2int8<isa>({static_cast<size_t>(vmm_src.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())}, output_type.is_signed());
} else {
dword2int8<isa>({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())}, output_type.is_signed());
}
break;
default:
assert(!"unsupported output data type");
}
}
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void jit_convert_saturation_emitter::dword2int8(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs, bool is_signed) const {
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src = Vmm(in_vec_idxs[0]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Xmm xmm_dst = Xmm(out_vec_idxs[0]);
Ymm ymm_dst = Ymm(out_vec_idxs[0]);
if (isa == dnnl::impl::cpu::x64::avx512_core) {
if (is_signed) {
h->vpmovsdb(xmm_dst, vmm_src);
} else {
Vmm vmm_zero = Vmm(aux_vec_idxs[0]);
h->vpxord(vmm_zero, vmm_zero, vmm_zero);
h->vpmaxsd(vmm_dst, vmm_src, vmm_zero);
h->vpmovusdb(xmm_dst, vmm_dst);
}
} else {
if (is_signed)
h->uni_vpackssdw(vmm_dst, vmm_src, vmm_src);
else
h->uni_vpackusdw(vmm_dst, vmm_src, vmm_src);
if (isa != dnnl::impl::cpu::x64::sse41)
h->vpermq(ymm_dst, ymm_dst, 0x08);
if (is_signed)
h->uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
else
h->uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
}
}
size_t jit_convert_saturation_emitter::aux_vecs_count() const {
// 1 register is needed for the unsigned dword2int8 case
return output_type == ov::element::u8 && host_isa_ == dnnl::impl::cpu::x64::avx512_core ? 1 : 0;
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,87 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <cpu/x64/jit_generator.hpp>
#include "jit_emitter.hpp"
#include "jit_bf16_emitters.hpp"
namespace ov {
namespace intel_cpu {
class jit_convert_emitter : public jit_emitter {
public:
jit_convert_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ngraph::Node>& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() const override;
protected:
void emit_data() const override;
void validate_types() const;
void float2bfloat(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
ov::element::Type input_type;
ov::element::Type output_type;
const ov::element::TypeVector supported_types = {
ov::element::f32,
ov::element::i32,
ov::element::bf16,
ov::element::i8,
ov::element::u8
};
std::shared_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16 = nullptr;
};
// This emitter is covered by the specification of the "Convert" operation. The implementation uses a "wrap-around" conversion.
// Example:
// int32_t -> int8_t
// 129 -> -127
class jit_convert_truncation_emitter : public jit_convert_emitter {
public:
jit_convert_truncation_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ngraph::Node>& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
private:
void emit_impl(const std::vector<size_t>& in, const std::vector<size_t>& out,
const std::vector<size_t>& pool, const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const override;
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void dword2int8(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
bool is_i8_and_u8_case() const;
void register_table_entries() override;
};
// This emitter follows the common dnnl conversion behavior. The implementation uses a "saturation" conversion.
// Example:
// int32_t -> int8_t
// 129 -> 127
class jit_convert_saturation_emitter : public jit_convert_emitter {
public:
jit_convert_saturation_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ngraph::Node>& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
private:
void emit_impl(const std::vector<size_t>& in, const std::vector<size_t>& out,
const std::vector<size_t>& pool, const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const override;
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void dword2int8(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs, bool is_signed) const;
size_t aux_vecs_count() const override;
};
} // namespace intel_cpu
} // namespace ov
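
For intuition, the two emitters differ only in how out-of-range values collapse. A minimal scalar sketch of the i32 -> i8 case (plain standalone C++, not the JIT path; function names are illustrative):
#include <algorithm>
#include <cstdint>
// Truncation ("wrap-around"): keep the low byte, as the vpand/pack sequence does.
int8_t to_i8_truncation(int32_t v) {
    return static_cast<int8_t>(v & 0xff);                 // 129 -> -127
}
// Saturation: clamp to the destination range, as vpmovsdb does.
int8_t to_i8_saturation(int32_t v) {
    return static_cast<int8_t>(std::clamp(v, -128, 127)); // 129 -> 127
}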

View File

@ -46,6 +46,10 @@ size_t jit_emitter::aux_vecs_count() const {
return 0;
}
emitter_in_out_map jit_emitter::get_in_out_type() const {
return in_out_type_;
}
size_t jit_emitter::aux_gprs_count() const {
// We need one gpr to load table address
return entry_map_.empty() ? 0 : 1;

View File

@ -55,6 +55,7 @@ public:
const std::vector<size_t> &pool_vec_idxs = {}, const std::vector<size_t> &pool_gpr_idxs = {});
virtual size_t get_inputs_num() const = 0;
virtual size_t aux_vecs_count() const;
emitter_in_out_map get_in_out_type() const;
static std::set<InferenceEngine::Precision> get_supported_precisions();
protected:

View File

@ -547,8 +547,10 @@ void jit_load_emitter::register_table_entries() {
/// STORE ///
jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
Precision src_prc, Precision dst_prc, int store_num, Precision exec_prc, emitter_in_out_map in_out_type)
: jit_emitter(host, host_isa, exec_prc, in_out_type), store_num_(store_num), src_prc_(src_prc), dst_prc_(dst_prc), name_("unknown") {
Precision src_prc, Precision dst_prc, int store_num, arithmetic_mode mode, Precision exec_prc,
emitter_in_out_map in_out_type)
: jit_emitter(host, host_isa, exec_prc, in_out_type), store_num_(store_num), src_prc_(src_prc), dst_prc_(dst_prc), mode_(mode), name_("unknown") {
prepare_table();
v_len_elt_ = get_vec_length() / exec_prc.size();
store_size_ = store_num * dst_prc.size();
if (!mayiuse(cpu::x64::avx512_core_bf16) && mayiuse(cpu::x64::avx512_core)) {
@ -556,9 +558,25 @@ jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host,
}
}
// 0 for temp reg for mask store for avx512
inline bool jit_store_emitter::is_saturation() const {
return mode_ == arithmetic_mode::saturation;
}
// case for SSE and AVX2 when we should use AND to truncate values
inline bool jit_store_emitter::is_truncation_emulation() const {
return !mayiuse(cpu::x64::avx512_core) && !is_saturation() &&
src_prc_ != dst_prc_ && one_of(dst_prc_, Precision::U16, Precision::I16, Precision::U8, Precision::I8);
}
size_t jit_store_emitter::aux_gprs_count() const {
return get_aux_regs_for_avx512_mask(store_num_ * src_prc_.size());
// for temp reg for mask store
int count = get_aux_regs_for_avx512_mask(store_num_ * src_prc_.size());
// for table value in truncation arithmetic mode
if (is_truncation_emulation())
count++;
return count;
}
size_t jit_store_emitter::aux_vecs_count() const {
@ -580,6 +598,7 @@ size_t jit_store_emitter::aux_vecs_count() const {
size_t jit_store_emitter::get_inputs_num() const { return 1; }
void jit_store_emitter::emit_data() const {
jit_emitter::emit_data();
if (emu_vcvtneps2bf16_)
emu_vcvtneps2bf16_->emit_data();
}
@ -618,7 +637,11 @@ void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64 &reg_d
switch (src_prc_) {
case Precision::FP32:
if ((dst_prc_ != Precision::FP32) && (dst_prc_ != Precision::BF16)) {
h->uni_vcvtps2dq(Vmm(aux_vec_idxs.back()), Vmm(data_idx));
if (is_saturation()) {
h->uni_vcvtps2dq(Vmm(aux_vec_idxs.back()), Vmm(data_idx));
} else {
h->uni_vcvttps2dq(Vmm(aux_vec_idxs.back()), Vmm(data_idx));
}
data_idx = aux_vec_idxs.back();
}
break;
@ -804,7 +827,7 @@ void jit_store_emitter::store_bytes(const Vmm &vmm, const Xbyak::Reg64 &reg, int
/**
* store_dword_to_byte_extension is the utility function to
* 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num bytes with signed or unsigned saturation.
* 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num bytes with or without signed or unsigned saturation.
* 2. store the packed byte into the memory referenced by ptr[reg + offset] address.
*/
template <typename Vmm>
@ -835,28 +858,37 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya
};
auto store_dword_to_byte_base = [&]() {
// db only available on avx512, need dw+wb to emulate
if (is_signed)
h->uni_vpackssdw(vmm, vmm, vmm);
else
h->uni_vpackusdw(vmm, vmm, vmm);
// gather 2(cross lane) 64 bits into lower vmm to store
// [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0]
if (is_ymm) {
h->vpermq(ymm, ymm, 0x08); // 00001000
}
if (is_saturation()) {
// db only available on avx512, need dw+wb to emulate
if (is_signed)
h->uni_vpackssdw(vmm, vmm, vmm);
else
h->uni_vpackusdw(vmm, vmm, vmm);
// gather 2(cross lane) 64 bits into lower vmm to store
// [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0]
if (is_ymm) {
h->vpermq(ymm, ymm, 0x08); // 00001000
}
if (is_signed)
h->uni_vpacksswb(vmm, vmm, vmm);
else
if (is_signed)
h->uni_vpacksswb(vmm, vmm, vmm);
else
h->uni_vpackuswb(vmm, vmm, vmm);
} else {
h->vpand(vmm, vmm, table_val("mask_truncation_byte")); // to avoid saturation
h->uni_vpackssdw(vmm, vmm, vmm);
if (is_ymm)
h->vpermq(ymm, ymm, 0x08);
h->uni_vpackuswb(vmm, vmm, vmm);
}
store_bytes(vmm, reg, offset, store_num);
};
switch (store_num) {
case 16:
// must support avx512F
case 16:
// must support avx512F
if (is_saturation()) {
if (is_signed) {
h->vpmovsdb(addr(0), vmm);
} else {
@ -865,9 +897,13 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya
h->uni_vpmaxsd(vmm, vmm, zero);
h->vpmovusdb(addr(0), vmm);
}
break;
case 8:
if (mayiuse(cpu::x64::avx512_core)) { // ymm block on avx512F + VL
} else {
h->vpmovdb(addr(0), vmm);
}
break;
case 8:
if (mayiuse(cpu::x64::avx512_core)) {
if (is_saturation()) { // ymm block on avx512F + VL
if (is_signed) {
h->vpmovsdb(addr(0), ymm);
} else {
@ -877,11 +913,15 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya
h->vpmovusdb(addr(0), ymm);
}
} else {
store_dword_to_byte_base();
h->vpmovdb(addr(0), ymm);
}
break;
case 4:
if (mayiuse(cpu::x64::avx512_core)) { // xmm block on avx512F + VL
} else {
store_dword_to_byte_base();
}
break;
case 4:
if (mayiuse(cpu::x64::avx512_core)) {
if (is_saturation()) { // xmm block on avx512F + VL
if (is_signed) {
h->vpmovsdb(addr(0), xmm);
} else {
@ -891,15 +931,19 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya
h->vpmovusdb(addr(0), xmm);
}
} else {
store_dword_to_byte_base();
h->vpmovdb(addr(0), xmm);
}
break;
default:
if (is_zmm) { // avx512F
unsigned int mask = 1;
mask = (mask << store_num) - mask;
h->mov(Reg32(aux_gpr_idxs[0]), mask);
h->kmovw(k_mask, Reg32(aux_gpr_idxs[0]));
} else {
store_dword_to_byte_base();
}
break;
default:
if (is_zmm) { // avx512F
unsigned int mask = 1;
mask = (mask << store_num) - mask;
h->mov(Reg32(aux_gpr_idxs[0]), mask);
h->kmovw(k_mask, Reg32(aux_gpr_idxs[0]));
if (is_saturation()) {
if (is_signed) {
h->vpmovsdb(addr(0), vmm | k_mask);
} else {
@ -909,9 +953,12 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya
h->vpmovusdb(addr(0), vmm | k_mask);
}
} else {
store_dword_to_byte_base();
h->vpmovdb(addr(0), vmm | k_mask);
}
break;
} else {
store_dword_to_byte_base();
}
break;
}
}
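
The tail path above computes a lane mask before the masked vpmov store. A quick self-check of that arithmetic with a hypothetical tail length:
#include <cassert>
int main() {
    unsigned int store_num = 3;        // hypothetical remainder of a 16-lane store
    unsigned int mask = 1;
    mask = (mask << store_num) - mask; // same arithmetic as in the emitter
    assert(mask == 0x7);               // kmovw puts 0b0111 into k_mask: 3 active lanes
    return 0;
}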
@ -946,16 +993,21 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
auto zmm = Xbyak::Zmm(vmm.getIdx());
auto store_dword_to_word_base = [&]() {
// direct mov_dw available only on avx512, emulate with pack_dw + permute + pure store
if (is_signed)
h->uni_vpackssdw(vmm, vmm, vmm);
else
// direct mov_dw available only on avx512
if (is_saturation()) { // emulate with pack_dw + permute + pure store for saturation mode
if (is_signed)
h->uni_vpackssdw(vmm, vmm, vmm);
else
h->uni_vpackusdw(vmm, vmm, vmm);
// gather 2/4(cross lane) 64 bits into lower vmm to store
// [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0]
// [ 128 | 128 ] |--> [ 128 | 128 ]
if (is_ymm) {
h->vpermq(ymm, ymm, 0x08); // 00001000
}
} else { // emulate with AND + pure store for truncation mode
h->vpand(vmm, vmm, table_val("mask_truncation_word"));
h->uni_vpackusdw(vmm, vmm, vmm);
// gather 2/4(cross lane) 64 bits into lower vmm to store
// [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0]
// [ 128 | 128 ] |--> [ 128 | 128 ]
if (is_ymm) {
h->vpermq(ymm, ymm, 0x08); // 00001000
}
store_bytes(vmm, reg, offset, store_num * 2);
@ -978,7 +1030,8 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
}
} else {
switch (store_num) {
case 16:
case 16:
if (is_saturation()) {
if (is_signed) {
h->vpmovsdw(ptr[reg + offset], vmm); // signed int32 saturate to signed int16.
} else {
@ -987,9 +1040,13 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
h->uni_vpmaxsd(vmm, zero, vmm); // if signed bit is 1, set value as 0.
h->vpmovusdw(ptr[reg + offset], vmm); // unsigned int32 saturate to unsigned int16.
}
break;
case 8:
if (mayiuse(cpu::x64::avx512_core)) {
} else {
h->vpmovdw(ptr[reg + offset], vmm);
}
break;
case 8:
if (mayiuse(cpu::x64::avx512_core)) {
if (is_saturation()) {
if (is_signed) {
h->vpmovsdw(ptr[reg + offset], ymm);
} else {
@ -999,11 +1056,15 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
h->vpmovusdw(ptr[reg + offset], ymm);
}
} else {
store_dword_to_word_base();
h->vpmovdw(ptr[reg + offset], ymm);
}
break;
case 4:
if (mayiuse(cpu::x64::avx512_core)) {
} else {
store_dword_to_word_base();
}
break;
case 4:
if (mayiuse(cpu::x64::avx512_core)) {
if (is_saturation()) {
if (is_signed) {
h->vpmovsdw(ptr[reg + offset], xmm);
} else {
@ -1013,15 +1074,19 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
h->vpmovusdw(ptr[reg + offset], xmm);
}
} else {
store_dword_to_word_base();
h->vpmovdw(ptr[reg + offset], xmm);
}
break;
default:
if (is_zmm) {
unsigned int mask = 1;
mask = (mask << store_num) - mask;
h->mov(Reg32(aux_gpr_idxs[0]), mask);
h->kmovw(k_mask, Reg32(aux_gpr_idxs[0]));
} else {
store_dword_to_word_base();
}
break;
default:
if (is_zmm) {
unsigned int mask = 1;
mask = (mask << store_num) - mask;
h->mov(Reg32(aux_gpr_idxs[0]), mask);
h->kmovw(k_mask, Reg32(aux_gpr_idxs[0]));
if (is_saturation()) {
if (is_signed) {
h->vpmovsdw(ptr[reg + offset], vmm | k_mask);
} else {
@ -1031,12 +1096,22 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
h->vpmovusdw(ptr[reg + offset], vmm | k_mask);
}
} else {
store_dword_to_word_base();
h->vpmovdw(ptr[reg + offset], vmm | k_mask);
}
break;
} else {
store_dword_to_word_base();
}
break;
}
}
}
void jit_store_emitter::register_table_entries() {
if (is_truncation_emulation()) {
push_arg_entry_of("mask_truncation_byte", 0x000000ff, true);
push_arg_entry_of("mask_truncation_word", 0x0000ffff, true);
}
}
} // namespace intel_cpu
} // namespace ov
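
The two masks reproduce two's-complement wrap-around at the scalar level. A small self-check sketch (values are illustrative):
#include <cassert>
#include <cstdint>
int main() {
    // mask_truncation_byte: keep the low byte, e.g. 385 -> -127 as int8_t
    assert(static_cast<int8_t>(385 & 0x000000ff) == -127);
    // mask_truncation_word: keep the low word, e.g. 65537 -> 1 as int16_t
    assert(static_cast<int16_t>(65537 & 0x0000ffff) == 1);
    return 0;
}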

View File

@ -39,6 +39,12 @@ struct store_emitter_params : public emitter_params {
int store_num_;
};
// Arithmetic modes for data type conversion in store_emitter
enum arithmetic_mode {
saturation,
truncation
};
class jit_load_emitter : public jit_emitter {
public:
jit_load_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, Precision src_prc, Precision dst_prc, int load_num,
@ -101,7 +107,8 @@ private:
class jit_store_emitter : public jit_emitter {
public:
jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, Precision src_prc, Precision dst_prc, int store_num,
Precision exec_prc = Precision::FP32, emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_gpr);
arithmetic_mode mode = arithmetic_mode::saturation, Precision exec_prc = Precision::FP32,
emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_gpr);
/**
* store_num values with src_prc in Vmm[in_vec_idx] are stored to the ptr[reg_dst + offset_byte] address as dst_prc data, where offset_byte is in_idxs[1]
@ -143,15 +150,21 @@ private:
template <typename Vmm>
void store_dword_to_word_extension(const Vmm &vmm, const Xbyak::Reg64 &reg, int offset, bool is_bf16, bool is_signed, int store_size) const;
void register_table_entries() override;
size_t aux_gprs_count() const override;
size_t aux_vecs_count() const override;
inline bool is_saturation() const;
inline bool is_truncation_emulation() const;
std::string name_;
int v_len_elt_; // 4/8/16
int store_num_;
int store_size_;
Precision src_prc_;
Precision dst_prc_;
arithmetic_mode mode_ = arithmetic_mode::saturation;
std::shared_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16_;
};

View File

@ -0,0 +1,671 @@
// Copyright (C) 2020-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <ngraph/rt_info.hpp>
#include <ngraph/variant.hpp>
#include <cpu/x64/jit_generator.hpp>
#include "jit_snippets_emitters.hpp"
using namespace Xbyak;
namespace ov {
namespace intel_cpu {
inline static void transform_idxs_to_regs(const std::vector<size_t>& idxs, std::vector<Reg64>& regs) {
regs.resize(idxs.size());
std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx){return Reg64(static_cast<int>(idx));});
}
jit_container_emitter::jit_container_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(h, isa, n) {
in_out_type_ = emitter_in_out_map::gpr_to_gpr;
}
void jit_container_emitter::map_abstract_registers(const std::vector<size_t> &vec_pool, const std::vector<size_t> &gpr_pool,
std::set<size_t>& vecs_used, std::set<size_t>& gprs_used) {
if (body.empty())
IE_THROW() << "Cannot map registers for jit_container_emitter when its body is empty";
auto abstract_to_physical = [](const std::vector<size_t>& abstract_regs, const std::vector<size_t>& regs_pool) {
std::vector<size_t> physical_regs(abstract_regs.size());
for (size_t i = 0; i < abstract_regs.size(); i++)
physical_regs[i] = regs_pool.at(abstract_regs[i]);
return physical_regs;
};
for (auto& code : body) {
const auto& emitter = code.first;
std::vector<size_t> in_abstract_regs, out_abstract_regs;
std::tie(in_abstract_regs, out_abstract_regs) = code.second;
std::vector<size_t> in_physical_regs, out_physical_regs;
switch (std::dynamic_pointer_cast<jit_emitter>(emitter)->get_in_out_type()) {
case gpr_to_gpr:
// Note that gpr_to_gpr is used for high-level utility operations like Kernel/TileScheduler/Tile.
// Input registers are not mapped in this case, since they contain utility info
// (num_params, tile increment, etc.), but not reg indexes.
in_physical_regs = std::move(in_abstract_regs);
out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool));
gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end());
break;
case gpr_to_vec:
// Load Emitters
in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, gpr_pool));
out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool));
gprs_used.insert(in_physical_regs.begin(), in_physical_regs.end());
vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end());
break;
case vec_to_gpr:
// Store Emitters
in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool));
out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool));
vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end());
gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end());
break;
case vec_to_vec:
// Regular operations
in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool));
out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool));
vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end());
vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end());
break;
default:
IE_THROW() << "Unhandled in_out type";
}
code.second = std::make_pair(in_physical_regs, out_physical_regs);
if (auto container = std::dynamic_pointer_cast<jit_container_emitter>(code.first))
container->map_abstract_registers(vec_pool, gpr_pool, vecs_used, gprs_used);
}
}
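
The abstract_to_physical lambda above is just an indexed pool lookup. A standalone sketch with hypothetical register indexes:
#include <cassert>
#include <cstddef>
#include <vector>
int main() {
    std::vector<size_t> pool     = {3, 5, 8, 9}; // free physical regs (illustrative)
    std::vector<size_t> abstract = {0, 2};       // indexes assigned by AssignRegisters
    std::vector<size_t> physical(abstract.size());
    for (size_t i = 0; i < abstract.size(); i++)
        physical[i] = pool.at(abstract[i]);      // abstract index selects from the pool
    assert(physical[0] == 3 && physical[1] == 8);
    return 0;
}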
KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n) : jit_container_emitter(h, isa, n) {
const auto kernel = ov::as_type_ptr<ngraph::snippets::op::Kernel>(n);
if (!kernel)
IE_THROW() << "KernelEmitter invoked with invalid op argument";
if (kernel->region.empty())
IE_THROW() << "KernelEmitter invoked with empty body";
body = kernel->region;
if (!kernel->compile_params)
IE_THROW() << "KernelEmitter invoked without compile_params";
jcp = *reinterpret_cast<const jit_snippets_compile_args*>(kernel->compile_params);
// Initialize pools of gp and vec registers
gp_regs_pool.resize(16);
vec_regs_pool.resize(16);
std::iota(gp_regs_pool.begin(), gp_regs_pool.end(), 0);
std::iota(vec_regs_pool.begin(), vec_regs_pool.end(), 0);
auto remove_regs_from_pool = [](std::vector<size_t>& pool, const std::set<size_t>& to_remove) {
// It's important to keep the order of other elements
pool.erase(std::remove_if(pool.begin(), pool.end(),
[&](size_t x) {return to_remove.count(x) != 0;}), pool.end());
};
// Reserve stack base and pointer for push(...) and pop(...) operations
// Reserve abi_param1 and abi_param2, since they'll be used to pass runtime call args to kernel
remove_regs_from_pool(gp_regs_pool, {Xbyak::Operand::RSP, Xbyak::Operand::RBP,
static_cast<size_t>(abi_param1.getIdx()),
static_cast<size_t>(abi_param2.getIdx())});
std::set<size_t> vecs_used, gprs_used;
map_abstract_registers(vec_regs_pool, gp_regs_pool, vecs_used, gprs_used);
remove_regs_from_pool(gp_regs_pool, gprs_used);
remove_regs_from_pool(vec_regs_pool, vecs_used);
// Remember used gprs to pass it to the TileSchedulerEmitter, so it can init them with appropriate data ptrs
gp_regs_used = std::vector<size_t>(gprs_used.begin(), gprs_used.end());
}
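
The erase-remove step above keeps the relative order of the surviving registers, which matters because abstract indexes later select from the pool by position. A minimal sketch with made-up indexes:
#include <algorithm>
#include <cstddef>
#include <set>
#include <vector>
int main() {
    std::vector<size_t> pool = {0, 1, 2, 3, 4, 5, 6, 7};
    const std::set<size_t> to_remove = {4, 5};   // e.g. RSP and RBP on x86-64
    pool.erase(std::remove_if(pool.begin(), pool.end(),
                              [&](size_t x) { return to_remove.count(x) != 0; }),
               pool.end());
    // pool is now {0, 1, 2, 3, 6, 7}: survivors keep their original order
    return pool.size() == 6 ? 0 : 1;
}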
void KernelEmitter::emit_code(const std::vector<size_t> &in,
const std::vector<size_t> &out,
const std::vector<size_t> &pool,
const std::vector<size_t> &gpr) const {
validate_arguments(in, out, pool, gpr);
emit_impl(in, out, pool, gpr, nullptr);
}
void KernelEmitter::validate_arguments(const std::vector<size_t> &in,
const std::vector<size_t> &out,
const std::vector<size_t> &pool,
const std::vector<size_t> &gpr) const {
if (in.size() != 2)
IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 2, got " << in.size();
if (!out.empty())
IE_THROW() << "KKernelEmitter got invalid number of outputs. Expected 0, got " << out.size();
}
void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params,
const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector<Reg64>& data_ptr_regs) const {
const int64_t harness_num_dims = jcp.output_dims.size() - 1;
auto init_ptrs_with_offsets = [&](Reg64 pointer, const int64_t *offsets, Reg64 reg_tmp) {
for (int j = 0; j < harness_num_dims; j++) {
if (jcp.output_dims[j] != 1 && offsets[j] != 0) {
h->mov(reg_tmp, offsets[j]);
h->imul(reg_tmp, h->ptr[reg_indexes + j * sizeof(size_t)]);
h->add(pointer, reg_tmp);
}
}
};
for (auto i = 0; i < num_params; i++) {
if (i < num_inputs)
h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(src_ptrs) + i * sizeof(void*)]);
else
h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]);
// we can use the last data_ptr_reg as tmp_reg until the last iteration, then reg_const_params
Reg64 reg_tmp = i < num_params-1 ? data_ptr_regs.back() : reg_const_params;
init_ptrs_with_offsets(data_ptr_regs[i], &jcp.data_offsets[i * harness_num_dims], reg_tmp);
}
}
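
The net effect of init_data_pointers for one parameter is plain strided-pointer arithmetic. A hypothetical scalar equivalent of the emitted mov/imul/add sequence (names mirror the jit code, not a real API):
#include <cstddef>
#include <cstdint>
uint8_t* init_ptr(uint8_t* base, const int64_t* offsets, const size_t* indexes,
                  const int64_t* output_dims, int64_t harness_num_dims) {
    for (int64_t j = 0; j < harness_num_dims; j++)
        if (output_dims[j] != 1 && offsets[j] != 0)  // size-1 dims contribute nothing
            base += offsets[j] * indexes[j];         // one offset per outer dim
    return base;
}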
void KernelEmitter::emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& allocated_vec_regs,
const std::vector<size_t>& allocated_gp_regs,
const ov::intel_cpu::emitter_context *emit_context) const {
h->preamble();
const size_t num_inputs = in[0];
const size_t num_outputs = in[1];
Reg64 reg_indexes = Reg64(abi_param1.getIdx());
Reg64 reg_const_params = Reg64(abi_param2.getIdx());
std::vector<Reg64> data_ptr_regs;
transform_idxs_to_regs(gp_regs_used, data_ptr_regs);
init_data_pointers(num_inputs, num_inputs + num_outputs, reg_indexes, reg_const_params, data_ptr_regs);
// todo: emit_impl is a const method, so we can't just push_back unused regs to the gp_regs_pool.
// we need a more elegant approach to avoid a full copy here
auto local_gpr_pool = gp_regs_pool;
local_gpr_pool.push_back(static_cast<size_t>(reg_indexes.getIdx()));
local_gpr_pool.push_back(static_cast<size_t>(reg_const_params.getIdx()));
for (const auto& c : body) {
const auto& emitter = c.first;
std::vector<size_t> in_regs, out_regs;
std::tie(in_regs, out_regs) = c.second;
if (auto tile_scheduler = std::dynamic_pointer_cast<TileSchedulerEmitter>(emitter))
out_regs = gp_regs_used;
emitter->emit_code(in_regs, out_regs, vec_regs_pool, local_gpr_pool);
}
h->postamble();
}
TileSchedulerEmitter::TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n) : jit_container_emitter(h, isa, n) {
const auto tile_scheduler = ov::as_type_ptr<ngraph::snippets::op::TileScheduler>(n);
if (!tile_scheduler)
IE_THROW() << "TileSchedulerEmitter invoked with invalid op argument";
if (!tile_scheduler->compile_params)
IE_THROW() << "TileEmitter invoked without compile_params";
body = {tile_scheduler->vector_region, tile_scheduler->scalar_region};
jcp = *reinterpret_cast<const jit_snippets_compile_args*>(tile_scheduler->compile_params);
}
void TileSchedulerEmitter::emit_code(const std::vector<size_t> &in,
const std::vector<size_t> &out,
const std::vector<size_t> &pool,
const std::vector<size_t> &gpr) const {
validate_arguments(in, out, pool, gpr);
emit_impl(in, out, pool, gpr, nullptr);
}
void TileSchedulerEmitter::validate_arguments(const std::vector<size_t> &in,
const std::vector<size_t> &out,
const std::vector<size_t> &pool,
const std::vector<size_t> &gpr) const {
if (in.size() != 3)
IE_THROW() << "TileSchedulerEmitter got invalid number of inputs. Expected 3, got " << in.size();
if (out.size() != in[0] + in[1])
IE_THROW() << "TileSchedulerEmitter got invalid number of outputs. Expected " << in[0] + in[1] << " , got " << out.size();
if (body.size() != 2)
IE_THROW() << "TileSchedulerEmitter got invalid body size, expected 2 (vector & scalar TileEmitter), got " << body.size();
if (!(std::dynamic_pointer_cast<TileEmitter>(body[0].first) && std::dynamic_pointer_cast<TileEmitter>(body[1].first)))
IE_THROW() << "TileSchedulerEmitter can contain only TileEmitters inside its body";
}
void TileSchedulerEmitter::emit_tiles(const Reg64& reg_inner_amount, const std::vector<Reg64>& data_ptr_regs, size_t vector_size,
const std::vector<size_t>& vec_pool, const std::vector<size_t>& gpr_pool) const {
// TileAllocatedEmitter is just an alias to perform dynamic_pointer_cast only once and reuse it below several times
using TileAllocatedEmitter = std::pair<std::shared_ptr<TileEmitter>, const ngraph::snippets::RegInfo&>;
TileAllocatedEmitter vector_tile {std::dynamic_pointer_cast<TileEmitter>(body[0].first), body[0].second};
TileAllocatedEmitter scalar_tile {std::dynamic_pointer_cast<TileEmitter>(body[1].first), body[1].second};
const size_t inner_work_amount = jcp.scheduler_dims[1];
auto process_tile =
[&](const bool evaluate_once, const TileAllocatedEmitter& tile) {
// If Tile is evaluated only once, then we can emit its body directly and skip work_amount decrements and checks
if (evaluate_once) {
tile.first->emit_body(vec_pool, gpr_pool);
} else {
std::vector<size_t> in_regs, out_regs;
std::tie(in_regs, out_regs) = tile.second;
// pass work_amount reg to Tile
in_regs.push_back(static_cast<size_t>(reg_inner_amount.getIdx()));
for (const auto& reg : data_ptr_regs)
out_regs.emplace_back(reg.getIdx());
tile.first->emit_code(in_regs, out_regs, vec_pool, gpr_pool);
}
};
// todo: these optimizations should be performed using the Tile graph representation in the future
bool vector_evaluate_once = false;
if (inner_work_amount >= vector_size) {
vector_evaluate_once = inner_work_amount < 2 * vector_size;
// Need to set proper work amount for inner tiles if evaluated multiple times
if (!vector_evaluate_once)
h->mov(reg_inner_amount, inner_work_amount);
process_tile(vector_evaluate_once, vector_tile);
}
if (inner_work_amount % vector_size >= 1) {
bool scalar_evaluate_once = inner_work_amount % vector_size < 2;
if (!scalar_evaluate_once) {
// vector_tile is not executed, work_amount is not set
if (inner_work_amount < vector_size) {
h->mov(reg_inner_amount, inner_work_amount);
// vector_tile is executed, but work_amount is neither set nor decremented appropriately.
} else if (vector_evaluate_once) {
vector_tile.first->emit_ptr_increments(data_ptr_regs);
h->mov(reg_inner_amount, inner_work_amount - vector_size);
}
// else: vector_tile is executed multiple times, so work_amount is already set
} else {
if (vector_evaluate_once) {
vector_tile.first->emit_ptr_increments(data_ptr_regs);
}
}
process_tile(scalar_evaluate_once, scalar_tile);
}
}
void TileSchedulerEmitter::emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& vec_pool,
const std::vector<size_t>& gpr_pool,
const ov::intel_cpu::emitter_context *emit_context) const {
const size_t num_inputs = in[0];
const size_t num_outputs = in[1];
const size_t vector_size = in[2];
const size_t num_params = num_inputs + num_outputs;
const auto& data_ptr_reg_idxs(out);
std::vector<Reg64> data_ptr_regs;
transform_idxs_to_regs(data_ptr_reg_idxs, data_ptr_regs);
// todo: emit_impl has const input args, so we can't just pop_back necessary regs from gpr_pool.
// we need a more elegant approach to avoid a full copy here. Similar problem is demonstrated in KernelEmitter
auto local_gpr_pool = gpr_pool;
Reg64 reg_outer_amount = Reg64(static_cast<int>(local_gpr_pool.back()));
local_gpr_pool.pop_back();
Reg64 reg_inner_amount = Reg64(static_cast<int>(local_gpr_pool.back()));
local_gpr_pool.pop_back();
Label for_body;
const size_t outer_work_amount = jcp.scheduler_dims[0];
if (outer_work_amount == 1) {
// emit code directly without looping over external dim
emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool);
} else if (outer_work_amount > 1) {
// We need to create a Loop in this case
h->mov(reg_outer_amount, outer_work_amount);
h->L(for_body);
{
emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool);
// Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers
// after reading/writing. This might be a problem if we need to read the same data multiple times (broadcasting shapes).
// To overcome this limitation, we add appropriate negative offsets if necessary.
for (size_t i = 0; i < num_params; i++) {
if (jcp.scheduler_offsets[i] != 0) {
h->add(data_ptr_regs[i], jcp.scheduler_offsets[i]);
}
}
// Note that outer dimensions are always incremented by 1 (outer tiles are always scalar)
h->sub(reg_outer_amount, 1);
h->cmp(reg_outer_amount, 1);
h->jge(for_body, CodeGenerator::T_NEAR);
}
}
}
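// Assumption-level C++ restatement of the control flow TileSchedulerEmitter produces; the work
// amounts, vlen and the callbacks are placeholders, not plugin API (requires <functional>):
static void tile_scheduler_reference(size_t outer_wa, size_t inner_wa, size_t vlen,
                                     const std::function<void()>& vector_body,   // vector Tile body + ptr increments
                                     const std::function<void()>& scalar_body,   // scalar Tile body + ptr increments
                                     const std::function<void()>& scheduler_offsets) {
    for (size_t outer = 0; outer < outer_wa; outer++) {  // the loop itself is not emitted when outer_wa == 1
        size_t inner = inner_wa;
        for (; inner >= vlen; inner -= vlen)  // vector Tile; emitted inline, without the loop, if it runs exactly once
            vector_body();
        for (; inner > 0; inner--)            // scalar remainder; skipped entirely when inner_wa % vlen == 0
            scalar_body();
        scheduler_offsets();                  // negative offsets rewind pointers that must be re-read
    }
}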
std::vector<AllocatedEmitter>& TileEmitter::get_nested_code() {
return body;
}
TileEmitter::TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n) : jit_container_emitter(h, isa, n) {
const auto tile = ov::as_type_ptr<ngraph::snippets::op::Tile>(n);
if (!tile)
IE_THROW() << "TileEmitter invoked with invalid op argument";
body = tile->region;
if (body.empty())
IE_THROW() << "TileEmitter is invoked with empty body";
num_inputs = tile->num_inputs;
num_outputs = tile->num_outputs;
io_dims = tile->io_dims;
io_data_size = tile->io_data_size;
increment = tile->increment;
if (io_dims.size() != num_inputs + num_outputs)
IE_THROW() << "TileEmitter constructor got inconsistent arguments. Check num_inputs + num_outputs == io_dims.size()";
}
void TileEmitter::emit_code(const std::vector<size_t> &in,
const std::vector<size_t> &out,
const std::vector<size_t> &pool,
const std::vector<size_t> &gpr) const {
validate_arguments(in, out, pool, gpr);
emit_impl(in, out, pool, gpr, nullptr);
}
void TileEmitter::validate_arguments(const std::vector<size_t> &in,
const std::vector<size_t> &out,
const std::vector<size_t> &pool,
const std::vector<size_t> &gpr) const {
if (in.size() != 1)
IE_THROW() << "TileEmitter got invalid number of inputs. Expected 1, got " << in.size();
if (out.size() != io_dims.size())
IE_THROW() << "TileEmitter got invalid number of outputs. Expected " << io_dims.size() << " , got " << out.size();
}
void TileEmitter::emit_body(const std::vector<size_t>& vec_pool, const std::vector<size_t>& gpr_pool) const {
for (auto& code : body)
code.first->emit_code(code.second.first, code.second.second, vec_pool, gpr_pool);
}
void TileEmitter::emit_ptr_increments(const std::vector<Reg64>& data_ptr_regs) const {
for (size_t i = 0; i < num_inputs + num_outputs; i++) {
// those with dims == 1 will be broadcasted, hence don't require increment
if (io_dims[i] != 1)
h->add(data_ptr_regs[i], increment * io_data_size[i]);
}
}
void TileEmitter::emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& vec_pool,
const std::vector<size_t>& gpr_pool,
const ov::intel_cpu::emitter_context *emit_context) const {
Reg64 work_amount = Reg64(static_cast<int>(in[0]));
std::vector<Reg64> data_ptr_regs;
transform_idxs_to_regs(out, data_ptr_regs);
Label for_body;
// Note that:
// * Work amount must be set by TileScheduler that executes Tiles
// * TileScheduler executes Tile only if it has to perform >= 1 iterations
h->L(for_body);
emit_body(vec_pool, gpr_pool);
emit_ptr_increments(data_ptr_regs);
h->sub(work_amount, increment);
h->cmp(work_amount, increment);
h->jge(for_body, CodeGenerator::T_NEAR);
}
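// The loop above is effectively a do-while (reference sketch with a placeholder body): the
// TileScheduler guarantees work_amount >= increment before the Tile is entered, so the emitted
// check comes after the body, exactly as in the sub/cmp/jge sequence above.
static void tile_reference(size_t work_amount, size_t increment,
                           const std::function<void()>& body_and_increments) {
    do {
        body_and_increments();  // emit_body + emit_ptr_increments (broadcast dims are not advanced)
        work_amount -= increment;
    } while (work_amount >= increment);
}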
BroadcastMoveEmitter::BroadcastMoveEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(h, isa, n) {
if (n->get_input_shape(0).empty())
use_broadcast = true;
else if (*n->get_input_shape(0).rbegin() != *n->get_output_shape(0).rbegin())
use_broadcast = true;
else
use_broadcast = false;
if (n->get_input_element_type(0) != n->get_output_element_type(0))
IE_THROW() << "BroadcastMoveEmitter supports only equal input and output types but gets: "
<< n->get_input_element_type(0) << " and " << n->get_output_element_type(0);
byte_size = n->get_input_element_type(0).size();
}
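// A tiny sketch of the use_broadcast decision above (not plugin code): broadcasting is needed
// when the input is a scalar (empty shape) or its innermost dim differs from the output's.
static bool needs_broadcast_sketch(const ov::Shape& in, const ov::Shape& out) {
    return in.empty() || in.back() != out.back();
}
// e.g. needs_broadcast_sketch({2, 17, 1}, {2, 17, 8}) == true  -> uni_vbroadcastss path
//      needs_broadcast_sketch({2, 17, 8}, {2, 17, 8}) == false -> plain uni_vmovups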
void BroadcastMoveEmitter::emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const {
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
} else {
IE_THROW() << "BroadcastMove emitter doesn't support " << host_isa_;
}
}
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void BroadcastMoveEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in[0]);
Xmm xmm_src0 = Xmm(in[0]);
Vmm vmm_dst = Vmm(out[0]);
if (use_broadcast) {
switch (byte_size) {
case 4: h->uni_vbroadcastss(vmm_dst, xmm_src0); break;
case 2: h->vpbroadcastw(vmm_dst, xmm_src0); break;
case 1: h->vpbroadcastb(vmm_dst, xmm_src0); break;
default: assert(!"unsupported data type");
}
} else {
if (vmm_src0 != vmm_dst)
h->uni_vmovups(vmm_dst, vmm_src0);
}
}
ScalarEmitter::ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(h, isa, n) {
value = dnnl::impl::cpu::x64::float2int(ov::as_type_ptr<ngraph::snippets::op::Scalar>(n)->cast_vector<float>()[0]);
push_arg_entry_of("scalar", value, true);
prepare_table();
}
void ScalarEmitter::emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const {
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
} else {
IE_THROW() << "Scalar emitter doesn't support " << host_isa_;
}
}
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void ScalarEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_dst = Vmm(out[0]);
h->uni_vbroadcastss(vmm_dst, table_val("scalar"));
}
MemoryEmitter::MemoryEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(h, isa, n) {
src_prc = InferenceEngine::details::convertPrecision(n->get_input_element_type(0));
dst_prc = InferenceEngine::details::convertPrecision(n->get_output_element_type(0));
}
StoreEmitter::StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n) : MemoryEmitter(h, isa, n) {
if (src_prc != dst_prc)
IE_THROW() << "StoreEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name();
count = ov::as_type_ptr<ngraph::snippets::op::Store>(n)->get_count();
in_out_type_ = emitter_in_out_map::vec_to_gpr;
store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count));
}
void StoreEmitter::emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const {
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
} else {
IE_THROW() << "Store emitter doesn't support " << host_isa_;
}
}
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void StoreEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
if (!store_emitter)
IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!";
store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs);
}
void StoreEmitter::emit_data() const {
store_emitter->emit_data();
}
LoadEmitter::LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n) : MemoryEmitter(h, isa, n) {
if (src_prc != dst_prc)
IE_THROW() << "LoadEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name();
count = ov::as_type_ptr<ngraph::snippets::op::Load>(n)->get_count();
in_out_type_ = emitter_in_out_map::gpr_to_vec;
load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count));
}
void LoadEmitter::emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const {
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
} else {
IE_THROW() << "Load emitter doesn't support " << host_isa_;
}
}
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void LoadEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
if (!load_emitter)
IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!";
load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs);
}
void LoadEmitter::emit_data() const {
load_emitter->emit_data();
}
BroadcastLoadEmitter::BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n) : MemoryEmitter(h, isa, n) {
if (src_prc != dst_prc)
IE_THROW() << "BroadcastEmitters support only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name();
in_out_type_ = emitter_in_out_map::gpr_to_vec;
}
void BroadcastLoadEmitter::emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const {
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
} else {
IE_THROW() << "BroadcastLoad emitter doesn't support " << host_isa_;
}
}
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void BroadcastLoadEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
Reg64 in_reg(in[0]);
Vmm vmm_dst = Vmm(out[0]);
// It doesn't really matter if we broadcast or `movss` for vector tails, so keep only one version for `BroadcastLoad`;
// the key point here is not to add a post-increment; it might be fixed by some other approach in the future
switch (src_prc.size()) {
case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg]); break;
case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg]); break;
case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg]); break;
default: assert(!"unsupported data type");
}
}
LoadConvertEmitter::LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
: MemoryEmitter(h, isa, n) {
count = ov::as_type_ptr<ngraph::snippets::op::Load>(n)->get_count();
in_out_type_ = emitter_in_out_map::gpr_to_vec;
load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count));
}
void LoadConvertEmitter::emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const {
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
} else {
IE_THROW() << "LoadConvert emitter doesn't support " << host_isa_;
}
}
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void LoadConvertEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
if (!load_emitter)
IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!";
load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs);
}
void LoadConvertEmitter::emit_data() const {
load_emitter->emit_data();
}
StoreConvertEmitter::StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n) : MemoryEmitter(h, isa, n) {
count = ov::as_type_ptr<ngraph::snippets::op::Store>(n)->get_count();
in_out_type_ = emitter_in_out_map::vec_to_gpr;
if (ov::is_type<ov::intel_cpu::StoreConvertTruncation>(n)) {
store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count, arithmetic_mode::truncation));
} else if (ov::is_type<ov::intel_cpu::StoreConvertSaturation>(n)) {
store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count, arithmetic_mode::saturation));
}
}
void StoreConvertEmitter::emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const {
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
} else {
IE_THROW() << "StoreConvert emitter doesn't support " << host_isa_;
}
}
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void StoreConvertEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
if (!store_emitter)
IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!";
store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs);
}
void StoreConvertEmitter::emit_data() const {
store_emitter->emit_data();
}
} // namespace intel_cpu
} // namespace ov

View File

@ -6,15 +6,21 @@
#include <ngraph/rt_info.hpp>
#include <ngraph/variant.hpp>
#include <ie_ngraph_utils.hpp>
#include "jit_emitter.hpp"
#include "jit_load_store_emitters.hpp"
#include "snippets_transformations/op/store_convert.hpp"
using namespace Xbyak;
using ngraph::snippets::AllocatedEmitter;
namespace ov {
namespace intel_cpu {
#define SNIPPETS_MAX_SNIPPETS_DIMS 7
#define SNIPPETS_MAX_SNIPPETS_DIMS 12
#define SNIPPETS_MAX_HARNESS_DIMS 5
#define SNIPPETS_MAX_TILE_RANK 2
#define GET_OFF(field) offsetof(jit_snippets_call_args, field)
@ -30,11 +36,27 @@ struct jit_snippets_compile_args {
std::vector<size_t> output_dims = {};
};
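// Hedged reconstruction of the compile-time block above: only the fields referenced in this
// diff are shown, and the types and array sizes are assumptions based on how they are indexed.
struct jit_snippets_compile_args_sketch {
    size_t scheduler_dims[SNIPPETS_MAX_TILE_RANK] = {};          // [0] outer, [1] inner work amounts
    int64_t scheduler_offsets[SNIPPETS_MAX_SNIPPETS_DIMS] = {};  // per-pointer post-tile adjustments (may be negative)
    int64_t data_offsets[SNIPPETS_MAX_SNIPPETS_DIMS * SNIPPETS_MAX_HARNESS_DIMS] = {};  // byte strides per harness dim
    std::vector<size_t> output_dims = {};
};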
///
/// \brief Kernel is the only entry point to Codegen JIT compilation. Kernel calculates appropriate data offsets,
/// and invokes enclosed outer Tiles. Only 2d Tiles are currently supported, so the emitters should
/// be organized in the following way:
/// KernelEmitter { /* entry point */
/// TileEmitter { /* outer tile */
/// \brief jit_container_emitter is designed to wrap Emitters that contain other Emitters (presently KernelEmitter,
/// TileSchedulerEmitter and TileEmitter). This is needed to provide a common interface for register mapping
/// (abstract to physical) and nested code access.
///
class jit_container_emitter: public jit_emitter {
public:
jit_container_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n);
protected:
// maps gpr and vec abstract registers to physical ones. Physical reg indexes are taken from the provided pools
// (the first two args). All the used gpr and vec registers are also stored in the provided sets (the last two args).
void map_abstract_registers(const std::vector<size_t>&, const std::vector<size_t>&,
std::set<size_t>&, std::set<size_t>&);
std::vector<AllocatedEmitter> body;
};
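// Assumption-level illustration of what map_abstract_registers does: abstract indexes recorded at
// code-generation time are replaced with physical ones drawn from the pool, and every physical
// register that gets used is remembered so the Kernel can exclude it from the free pools. The
// exact mapping policy is a guess; this only shows the shape of the operation.
static std::vector<size_t> map_abstract_to_physical_sketch(const std::vector<size_t>& abstract_regs,
                                                           const std::vector<size_t>& pool,
                                                           std::set<size_t>& used) {
    std::vector<size_t> physical;
    physical.reserve(abstract_regs.size());
    for (const auto abstract : abstract_regs) {
        const size_t phys = pool.at(abstract % pool.size());  // real policy may differ
        used.insert(phys);
        physical.push_back(phys);
    }
    return physical;
}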
///
/// \brief Kernel is the only entry point to Codegen JIT compilation. Kernel performs abstract-to-physical register
/// mapping and creates pools of available gpr and vec registers. Kernel is expected to contain (at least one)
/// TileSchedulerEmitter. In general the enclosed emitters should be organized in the following way:
/// KernelEmitter { /* entry point, maps registers, creates pools of available registers */
/// TileSchedulerEmitter { /* executes required inner Tiles, avoids emitting code that won't be executed */
/// TileEmitter { /* inner vector tile */
/// ... /* All the necessary Load/Store/elementwise emitters */
/// }
@ -43,255 +65,110 @@ struct jit_snippets_compile_args {
/// }
/// }
/// }
/// Note that Kernel params are passed directly to the emit_code(). The vector of inputs should contain 2 arguments, the
/// output vector should be empty. Input parameters
/// Note that Kernel doesn't accept any input arguments.
///
/// \param in[0] The number of the node inputs
/// \param in[1] The number of the node outputs
///
// Todo: Scheduler dims and offsets are currently calculated in Subgraph node and passed to the KernelEmitter.
// However, it seems more natural to calculate all the offsets right in the Kernel op, because the calculation is
// not device-specific. It is based only on input/output dims (which we already know) and harness num dims
// (which we should pass from the plugin). It also seems better to wrap the enclosed emitters in tiles in the Kernel op
// and avoid creating empty tiles.
class KernelEmitter : public jit_emitter {
class KernelEmitter : public jit_container_emitter {
public:
KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n)
: jit_emitter(h, isa, n) {
const auto kernel = ov::as_type_ptr<ngraph::snippets::op::Kernel>(n);
if (!kernel)
IE_THROW() << "KernelEmitter invoked with invalid op argument";
if (!kernel->compile_params)
IE_THROW() << "KernelEmitter invoked without compile_params";
code = kernel->region;
jcp = *reinterpret_cast<const jit_snippets_compile_args*>(kernel->compile_params);
}
const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override {return 0;}
void emit_code(const std::vector<size_t> &in, const std::vector<size_t> &out,
const std::vector<size_t> &pool = {}, const std::vector<size_t> &gpr = {}) const override {
validate_arguments(in, out, pool, gpr);
emit_impl(in, out, pool, gpr, nullptr);
}
void emit_code(const std::vector<size_t> &in,
const std::vector<size_t> &out,
const std::vector<size_t> &pool,
const std::vector<size_t> &gpr) const override;
private:
void validate_arguments(const std::vector<size_t> &in, const std::vector<size_t> &out,
const std::vector<size_t> &pool = {}, const std::vector<size_t> &gpr = {}) const override {
if (in.size() != 2)
IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 2, got " << in.size();
if (out.size() != 0)
IE_THROW() << "KernelEmitter got unexpected output arguments.";
const size_t num_params = in[0] + in[1];
if (num_params > SNIPPETS_MAX_SNIPPETS_DIMS)
IE_THROW() << "KernelEmitter supports only up to " << SNIPPETS_MAX_SNIPPETS_DIMS <<
" parameters, got " << num_params;
const int64_t harness_num_dims = jcp.output_dims.size() - 1;
if (harness_num_dims > SNIPPETS_MAX_HARNESS_DIMS)
IE_THROW() << "KernelEmitter supports harness with up to " << SNIPPETS_MAX_HARNESS_DIMS <<
" dims, got " << harness_num_dims;
}
void validate_arguments(const std::vector<size_t> &in,
const std::vector<size_t> &out,
const std::vector<size_t> &pool,
const std::vector<size_t> &gpr) const override;
void emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const override {
const size_t num_inputs = in[0];
const size_t num_outputs = in[1];
const size_t num_params = num_inputs + num_outputs;
int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1
const int64_t harness_num_dims = jcp.output_dims.size() - 1;
Reg64 reg_indexes { dnnl::impl::cpu::x64::abi_param_regs[0] };
Reg64 reg_const_params { dnnl::impl::cpu::x64::abi_param_regs[1] };
Xbyak::Reg64 reg_tmp_64 { dnnl::impl::cpu::x64::abi_not_param_reg };
h->preamble();
std::vector<Reg64> regs(num_params);
auto init_ptrs_with_offsets = [&](Reg64 pointer, const int64_t *offsets) {
for (int j = 0; j < harness_num_dims; j++) {
if (jcp.output_dims[j] != 1 && offsets[j] != 0) {
h->mov(reg_tmp_64, offsets[j]);
h->imul(reg_tmp_64, h->ptr[reg_indexes + j * sizeof(size_t)]);
h->add(pointer, reg_tmp_64);
}
}
};
for (auto i = 0; i < num_params; i++) {
regs[i] = Reg64(reg64_tmp_start + i);
if (i < num_inputs)
h->mov(regs[i], h->ptr[reg_const_params + GET_OFF(src_ptrs) + i * sizeof(void*)]);
else
h->mov(regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]);
init_ptrs_with_offsets(regs[i], &jcp.data_offsets[i * harness_num_dims]);
}
for (auto& c : code) {
c.first->emit_code(c.second.first, c.second.second, pool, gpr);
}
h->postamble();
}
const ov::intel_cpu::emitter_context *emit_context) const override;
void init_data_pointers(size_t, size_t, const Reg64&, const Reg64&, const std::vector<Reg64>&) const;
jit_snippets_compile_args jcp;
std::vector<std::pair<std::shared_ptr<Emitter>, ngraph::snippets::RegInfo>> code;
std::vector<size_t> gp_regs_pool;
std::vector<size_t> gp_regs_used;
std::vector<size_t> vec_regs_pool;
};
///
/// \brief TileSchedulerEmitter contains Tiles to be executed (presently vector and scalar). It calculates data offsets
/// and work amounts, performs data pointer decrements if necessary. It also performs some Tile optimizations: scalar/vector
/// tiles are emitted only if necessary; a Tile body could be emitted directly if only one Tile evaluation is required.
///
/// \param in[0] The number of the node inputs
/// \param in[1] The number of the node outputs
/// \param in[2] The number of elements that fits into vector register
///
class TileSchedulerEmitter : public jit_container_emitter {
public:
TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override {return 0;}
void emit_code(const std::vector<size_t> &in,
const std::vector<size_t> &out,
const std::vector<size_t> &pool,
const std::vector<size_t> &gpr) const override;
private:
void validate_arguments(const std::vector<size_t> &in,
const std::vector<size_t> &out,
const std::vector<size_t> &pool,
const std::vector<size_t> &gpr) const override;
void emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const override;
void emit_tiles(const Reg64&, const std::vector<Reg64>&, size_t, const std::vector<size_t>& , const std::vector<size_t>&) const;
jit_snippets_compile_args jcp;
};
///
/// \brief Tile is designed to organize loop over the input and output data. It is essentially a for(...) loop:
/// it calculates the total number of iterations, performs operations specified by enclosed emitters, advances iteration counters
/// it performs operations specified by enclosed emitters, advances iteration counters
/// and breaks when necessary.
///
/// \param in[0] The number of input entities (or scheduler counts) processed during one iteration of the tile.
/// It is expected to be 1 for outer or scalar tiles and vlen for vector tiles.
/// \param in[1] Increment of the previous Tile in current dimension. Must be 0 if this is the first Tile.
/// So previous_inc is zero for outer and vector tiles (they are the first in the dim) and vlen for scalar tiles (they usually go after vector Tiles).
/// \param in[2] The sum of the numbers of inputs and outputs of the node.
/// \param in[3] dimension of the tile. Note that only 2d Tiles are currently supported, so dim is 0 for outer tiles, 1 for inner tiles.
///
// Todo: Inner and outer tiles have different semantics. For example, outer tile always has the increment == 1, and it can contain only
// tile emitters (one outer or two inner). So it seems better to create different classes for inner and outer tiles.
// Todo: Currently data pointers incremented after each read/write in Load/Store emitters, so we have to decrement them here
// if the same data needs to be read twice. Better to move all the pointer increments to TileEmitter and avoid the increments if necessary.
class TileEmitter : public jit_emitter {
/// It is expected to be 1 for outer or scalar tiles and vlen for vector tiles.
class TileEmitter : public jit_container_emitter {
public:
TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n)
: jit_emitter(h, isa, n) {
const auto tile = ov::as_type_ptr<ngraph::snippets::op::Tile>(n);
if (!tile)
IE_THROW() << "TileEmitter invoked with invalid op argument";
if (!tile->compile_params)
IE_THROW() << "TileEmitter invoked without compile_params";
code = tile->region;
jcp = *reinterpret_cast<const jit_snippets_compile_args*>(tile->compile_params);
}
TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override {return 0;}
std::vector<AllocatedEmitter>& get_nested_code();
void emit_code(const std::vector<size_t> &in,
const std::vector<size_t> &out,
const std::vector<size_t> &pool,
const std::vector<size_t> &gpr) const override;
void emit_code(const std::vector<size_t> &in, const std::vector<size_t> &out,
const std::vector<size_t> &pool = {}, const std::vector<size_t> &gpr = {}) const override {
validate_arguments(in, out, pool, gpr);
emit_impl(in, out, pool, gpr, nullptr);
}
void emit_body(const std::vector<size_t>& vec_pool, const std::vector<size_t>& gpr_pool) const;
void emit_ptr_increments(const std::vector<Reg64>& data_ptr_regs) const;
private:
void validate_arguments(const std::vector<size_t> &in, const std::vector<size_t> &out,
const std::vector<size_t> &pool = {}, const std::vector<size_t> &gpr = {}) const override {
if (in.size() != 4)
IE_THROW() << "TileEmitter got invalid number of inputs. Expected 4, got " << in.size();
if (out.size() != 0)
IE_THROW() << "TileEmitter got unexpected output arguments.";
const size_t num_params = in[2];
if (num_params > SNIPPETS_MAX_SNIPPETS_DIMS)
IE_THROW() << "TileEmitter supports only up to " << SNIPPETS_MAX_SNIPPETS_DIMS <<
" parameters, got " << num_params;
const size_t dim = in[3];
if (dim >= SNIPPETS_MAX_TILE_RANK)
IE_THROW() << "TileEmitter supports tile ranks up to " << SNIPPETS_MAX_TILE_RANK <<
" got " << dim;
}
void validate_arguments(const std::vector<size_t> &in,
const std::vector<size_t> &out,
const std::vector<size_t> &pool,
const std::vector<size_t> &gpr) const override;
void emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const override {
const size_t inc = in[0];
const size_t previous_inc = in[1]; // increment of a previous tile in the same dim (0 if the first tile in the dim)
const size_t num_params = in[2];
const size_t dim = in[3]; // tile dimension: 0 - outer, 1 - inner
const int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1
Reg64 amount = Reg64(reg64_tmp_start + num_params); // amount
std::array<Label, 2> for_body;
const ov::intel_cpu::emitter_context *emit_context) const override;
// If R15 is not used, reserve it for use by ScalarEmitter to avoid redundant push-pops.
// todo: Do we need to explicitly check that the code contains a ScalarEmitter?
std::vector<size_t> local_gpr = reg64_tmp_start + num_params < 15 ? std::vector<size_t>{15} : std::vector<size_t>{};
std::vector<Reg64> regs(num_params);
for (auto i = 0; dim == 0 && i < num_params; i++)
regs[i] = Reg64(reg64_tmp_start + i);
// Loop processing could be simplified in some cases
if (inc > jcp.scheduler_dims[dim]) {
return;
} else if (inc == jcp.scheduler_dims[dim]) {
for (auto& c : code) {
c.first->emit_code(c.second.first, c.second.second, pool, local_gpr);
}
} else {
// The previous tile has done nothing, all the work is ours
if (previous_inc == 0 || previous_inc > jcp.scheduler_dims[dim]) {
h->mov(amount, jcp.scheduler_dims[dim]);
// The previous tile has done all the work
} else if (jcp.scheduler_dims[dim] % previous_inc == 0) {
return;
}// else: the previous tile has already set a proper work amount
h->cmp(amount, inc);
h->jl(for_body[0], CodeGenerator::T_NEAR);
h->L(for_body[1]);
{
h->push(amount);
for (auto& c : code) {
c.first->emit_code(c.second.first, c.second.second, pool, local_gpr);
}
h->pop(amount);
// Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers
// after reading/writing. This might be a problem if we need to read the same data multiple times (broadcasting shapes).
// To overcome this limitation, we add appropriate negative offsets if necessary.
for (auto i = 0; dim == 0 && i < num_params; i++) {
if (jcp.scheduler_offsets[i] != 0) {
h->add(regs[i], jcp.scheduler_offsets[i]);
}
}
h->sub(amount, inc);
h->cmp(amount, inc);
h->jge(for_body[1], CodeGenerator::T_NEAR);
}
h->L(for_body[0]);
}
}
// A = <42, 17>
// B = < 1, 17>
// for (auto k = 0; k < dom_0; k++) { // 42
// for (auto n = 0; n < dom_1; n++) { // 17
// auto a = *ptr0; ptr0 += vlen; // vector/scalar load
// auto b = *ptr1; ptr1 += vlen; // vector/scalar load
// }
// ptr0 -= 0*dom_1;
// ptr1 -= 1*dom_1;
// }
// broadcast by MVD is an extra case
// A = <42, 17>
// B = <42, 1>
// for (auto k = 0; k < dom_0; k++) { // 42
// for (auto n = 0; n < dom_1; n++) { // 17
// auto a = *ptr0; ptr0 += vlen; // vector/scalar load
// auto b = *ptr1; // broadcast load
// }
// ptr0 -= 0*dom_1;
// ptr1 += sizeof(ptr1[0]); //ptr1 -= -sizeof(ptr1[0]);
// }
// A = <42, 17, 31>
// B = < 1, 17, 31>
// for (auto k = 0; k < dom_0; k++) { // 42
// for (auto n = 0; n < dom_1; n++) { // 17
// for (auto m = 0; m < dom_2; m++) { // 31
// auto a = *ptr0; ptr0 += vlen; // vector/scalar load
// auto b = *ptr1; ptr1 += vlen; // vector/scalar load
// }
// }
// ptr0 -= 0*dom_1*dom_2;
// ptr1 -= 1*dom_1*dom_2;
// }
jit_snippets_compile_args jcp;
std::vector<std::pair<std::shared_ptr<Emitter>, ngraph::snippets::RegInfo>> code;
size_t num_inputs = 0;
size_t num_outputs = 0;
std::vector<size_t> io_dims {};
std::vector<size_t> io_data_size {};
size_t increment = 0;
};
class NopEmitter : public jit_emitter {
@ -311,17 +188,10 @@ private:
}
};
class FakeBroadcastEmitter : public jit_emitter {
class BroadcastMoveEmitter : public jit_emitter {
public:
FakeBroadcastEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
: jit_emitter(h, isa, n) {
if (n->get_input_shape(0).empty())
use_broadcast = true;
else if (*n->get_input_shape(0).rbegin() != *n->get_output_shape(0).rbegin())
use_broadcast = true;
else
use_broadcast = false;
}
BroadcastMoveEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override {return 1;}
private:
@ -329,45 +199,19 @@ private:
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const override {
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
} else {
IE_THROW() << host_isa_;
assert(!"unsupported isa");
}
}
const ov::intel_cpu::emitter_context *emit_context) const override;
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in[0]);
Vmm vmm_dst = Vmm(out[0]);
if (use_broadcast) {
h->uni_vbroadcastss(vmm_dst, Xmm(in[0]));
} else {
h->uni_vmovups(vmm_dst, vmm_src0);
}
}
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
private:
bool use_broadcast;
size_t byte_size = 0lu;
};
class ScalarEmitter : public jit_emitter {
public:
ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
: jit_emitter(h, isa, n) {
value = dnnl::impl::cpu::x64::float2int(ov::as_type_ptr<ngraph::snippets::op::Scalar>(n)->cast_vector<float>()[0]);
push_arg_entry_of("scalar", value, true);
prepare_table();
}
ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override {return 0;}
@ -379,26 +223,10 @@ private:
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const override {
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
} else {
IE_THROW() << host_isa_;
assert(!"unsupported isa");
}
}
const ov::intel_cpu::emitter_context *emit_context) const override;
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_dst = Vmm(out[0]);
h->uni_vbroadcastss(vmm_dst, table_val("scalar"));
}
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
private:
int32_t value;
@ -415,33 +243,16 @@ private:
/// Blocked parameter to tell if the input is actually blocked. Broadcast means broadcast by W; in other cases there is no need to substitute the load.
class MemoryEmitter : public jit_emitter {
public:
MemoryEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
: jit_emitter(h, isa, n), ea(getEA(n)) {
}
size_t get_inputs_num() const override {return 1;}
MemoryEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
protected:
static auto getEA(const std::shared_ptr<ov::Node>& n) -> size_t {
auto& rt = n->get_rt_info();
size_t ea = 0;
auto it = rt.find("effectiveAddress");
if (it != rt.end()) {
ea = it->second.as<int64_t>();
} else {
throw ov::Exception("effective address for Load generation cannot be determined");
}
return ea;
}
size_t ea;
Precision src_prc;
Precision dst_prc;
};
class StoreEmitter : public MemoryEmitter {
public:
StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
: MemoryEmitter(h, isa, n) {
}
StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override {return 1;}
@ -450,72 +261,20 @@ private:
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const override {
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
} else {
IE_THROW() << host_isa_;
assert(!"unsupported isa");
}
}
const ov::intel_cpu::emitter_context *emit_context) const override;
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
Reg64 out_reg(ea);
Vmm vmm_src0 = Vmm(in[0]);
h->uni_vmovups(h->ptr[out_reg], vmm_src0);
h->add(out_reg, dnnl::impl::cpu::x64::cpu_isa_traits<isa>::vlen);
}
};
class ScalarStoreEmitter : public MemoryEmitter {
public:
ScalarStoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
: MemoryEmitter(h, isa, n) {
}
size_t get_inputs_num() const override {return 1;}
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
void emit_data() const override;
private:
void emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const override {
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
} else {
IE_THROW() << host_isa_;
assert(!"unsupported isa");
}
}
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
Reg64 out_reg(ea);
Xmm vmm_src0 = Xmm(in[0]);
h->uni_vmovss(h->ptr[out_reg], vmm_src0);
h->add(out_reg, sizeof(float));
}
size_t count;
std::unique_ptr<jit_store_emitter> store_emitter = nullptr;
};
class LoadEmitter : public MemoryEmitter {
public:
LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
: MemoryEmitter(h, isa, n), shouldPostIncrement(*n->get_input_shape(0).rbegin() != 1) {
}
LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override {return 0;}
@ -524,41 +283,21 @@ private:
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const override {
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
} else {
IE_THROW() << host_isa_;
assert(!"unsupported isa");
}
}
const ov::intel_cpu::emitter_context *emit_context) const override;
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
Reg64 in_reg(ea);
Vmm vmm_src0 = Vmm(out[0]);
h->uni_vmovups(vmm_src0, h->ptr[in_reg]);
if (shouldPostIncrement) {
h->add(in_reg, dnnl::impl::cpu::x64::cpu_isa_traits<isa>::vlen);
}
}
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
void emit_data() const override;
private:
bool shouldPostIncrement;
size_t count;
std::unique_ptr<jit_load_emitter> load_emitter = nullptr;
};
class BroadcastLoadEmitter : public MemoryEmitter {
public:
BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
: MemoryEmitter(h, isa, n) {
}
BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override {return 0;}
private:
@ -566,73 +305,54 @@ private:
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const override {
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
} else {
IE_THROW() << host_isa_;
assert(!"unsupported isa");
}
}
const ov::intel_cpu::emitter_context *emit_context) const override;
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
Reg64 in_reg(ea);
Vmm vmm_src0 = Vmm(out[0]);
// It doesn't really matter if we broadcast or `movss` for vector tails, so keep only one version for `BroadcastLoad`;
// the key point here is not to add a post-increment; it might be fixed by some other approach in future
h->uni_vbroadcastss(vmm_src0, h->ptr[in_reg]);
}
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
};
class ScalarLoadEmitter : public MemoryEmitter {
class LoadConvertEmitter : public MemoryEmitter {
public:
ScalarLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
: MemoryEmitter(h, isa, n), shouldPostIncrement(*n->get_input_shape(0).rbegin() != 1) {
}
LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override {return 0;}
private:
void emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const override {
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
} else {
IE_THROW() << host_isa_;
assert(!"unsupported isa");
}
}
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const override;
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
Reg64 in_reg(ea);
Xmm vmm_src0 = Xmm(out[0]);
h->uni_vmovss(vmm_src0, h->ptr[in_reg]);
// Doesn't work if the same pointer comes with multiple load operations
if (shouldPostIncrement) {
h->add(in_reg, sizeof(float));
}
}
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
void emit_data() const override;
private:
bool shouldPostIncrement;
size_t count;
std::unique_ptr<jit_load_emitter> load_emitter = nullptr;
};
class StoreConvertEmitter : public MemoryEmitter {
public:
StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override {return 1;}
private:
void emit_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr,
const ov::intel_cpu::emitter_context *emit_context) const override;
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
void emit_data() const override;
private:
size_t count;
std::unique_ptr<jit_store_emitter> store_emitter = nullptr;
};
} // namespace intel_cpu

View File

@ -7,6 +7,8 @@
#include "ngraph_transformations/op/leaky_relu.hpp"
#include "ngraph_transformations/op/power_static.hpp"
#include "ngraph_transformations/op/swish_cpu.hpp"
#include "snippets_transformations/op/load_convert.hpp"
#include "snippets_transformations/op/store_convert.hpp"
#include <ngraph/ngraph.hpp>
#include <ngraph_ops/augru_cell.hpp>
@ -42,6 +44,10 @@ std::map<std::string, ngraph::OpSet> Extension::getOpSets() {
NGRAPH_OP(LeakyReluNode, ov::intel_cpu)
NGRAPH_OP(PowerStaticNode, ov::intel_cpu)
NGRAPH_OP(SwishNode, ov::intel_cpu)
NGRAPH_OP(LoadConvertSaturation, ov::intel_cpu)
NGRAPH_OP(LoadConvertTruncation, ov::intel_cpu)
NGRAPH_OP(StoreConvertSaturation, ov::intel_cpu)
NGRAPH_OP(StoreConvertTruncation, ov::intel_cpu)
#undef NGRAPH_OP
return opset;

View File

@ -180,6 +180,39 @@ bool isSuitableMatMulParent(const std::shared_ptr<const Node> &node) {
const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1);
return is_suitable_node && has_only_child;
}
// Subtract as ZeroPoints for Convolution
bool isSuitableSubtractAsZeroPointsParent(const std::shared_ptr<const Node> &node) {
const bool is_suitable_node = ov::is_type<ngraph::op::v1::Subtract>(node);
const auto out = node->outputs();
const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1);
const bool has_two_parents = node->get_input_size() == 2;
if (!(is_suitable_node && has_only_child && has_two_parents))
return false;
const auto child = node->get_output_target_inputs(0).begin()->get_node()->shared_from_this();
const bool is_conv = ov::is_type<ov::op::v1::Convolution>(child);
const bool is_group_conv = ov::is_type<ov::op::v1::GroupConvolution>(child);
if (!is_conv && !is_group_conv)
return false;
const auto weight_shape = child->get_input_shape(1);
const bool is_depthwise = is_group_conv && weight_shape[1] == 1 && weight_shape[2] == 1;
const bool depthwise_is_suitable = implication(is_depthwise, child->get_input_shape(0).size() < 5);
if (!depthwise_is_suitable)
return false;
const bool first_input_is_suitable = node->get_input_node_shared_ptr(0)->get_output_element_type(0) == ov::element::u8;
const auto zp_weights = node->get_input_node_shared_ptr(1);
const auto zp_weight_shape = zp_weights->get_output_shape(0);
bool second_input_is_suitable =
ov::is_type<ngraph::op::v0::Constant>(zp_weights) &&
zp_weights->get_output_element_type(0) == ov::element::u8 &&
zp_weight_shape.size() >= 2;
if (!(first_input_is_suitable && second_input_is_suitable))
return false;
auto correct_shape = ov::Shape(zp_weight_shape.size(), 1);
correct_shape[1] = zp_weight_shape[1];
return correct_shape == zp_weight_shape;
}
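// Worked example of the final shape check above (illustrative values, not plugin code): only
// per-output-channel zero points of shape {1, C, 1, ..., 1} qualify.
static bool zp_shape_is_suitable_sketch(const ov::Shape& zp_shape) {
    ov::Shape correct(zp_shape.size(), 1);
    correct[1] = zp_shape[1];
    return correct == zp_shape;  // true for {1, 16, 1, 1}; false for {1, 16, 3, 3}
}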
bool isSuitablePoolChild(const std::shared_ptr<const Node> &node) {
const bool is_suitable_node = ov::is_type<ngraph::op::v1::MaxPool>(node);
// has a single output, connected to a single child
@ -225,15 +258,40 @@ bool isSuitableChildForFusingMatMul(const std::shared_ptr<const Node> &node, Nod
// FuseMatMulAndSimpleOperation or FuseFullyConnectedAndSimpleOperation
// Invoke SupportsFusingWithConvolution_Simple directly instead of isSuitableChildForFusingSimple to
// eliminate getNumNonConstInputs() check
int fusingAxis;
if (can_be_converted_to_FC)
fusingAxis = matmul_shape.size() == 3 ? 2 : 1;
else
fusingAxis = matmul_shape.size() - 1;
int fusingAxis = can_be_converted_to_FC ? (matmul_shape.size() == 3 ? 2 : 1) : matmul_shape.size() - 1;
if (SupportsFusingWithConvolution_Simple(node, fusingAxis)) {
updatedChainType = NodeFusingType::FusedWithMisc;
return true;
}
// canFuse() from MatMul for case with rank > 2
// Algorithm::EltwisePowerStatic is ignored
if (!can_be_converted_to_FC &&
node->get_output_shape(0).size() > 2) {
if (ov::is_type<ov::op::v1::Add>(node) ||
ov::is_type<ov::op::v1::Multiply>(node) ||
ov::is_type<ov::op::v1::Subtract>(node) ||
ov::is_type<ov::op::v1::Divide>(node) ||
ov::is_type<ov::op::v0::PRelu>(node)) {
const auto const1 = ov::is_type<ov::op::v0::Constant>(node->get_input_node_shared_ptr(0));
const auto const2 = ov::is_type<ov::op::v0::Constant>(node->get_input_node_shared_ptr(1));
int constPort = -1;
if (const2) {
constPort = 1;
} else if (const1) {
constPort = 0;
}
if (constPort != -1) {
auto const_shape = node->get_input_shape(constPort);
if (ov::shape_size(const_shape) != 1) {
return false;
}
}
}
}
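// Minimal restatement of the constant-input rule above (sketch, mirrors the canFuse() logic):
// a binary eltwise that follows a rank > 2 MatMul may only fuse when its constant input is a scalar.
static bool scalar_const_is_fusable_sketch(const ov::Shape& const_shape) {
    return ov::shape_size(const_shape) == 1;  // {1, 1, 1, 1} fuses; {1, 1, 1, 8} does not
}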
// FullyConnectedBiasFusion
if (!(can_be_converted_to_FC && ov::is_type<ngraph::opset1::Add>(node) &&
bias_shape.back() == matmul_shape.back() &&
@ -340,6 +398,9 @@ bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr<ov::Model> &m) {
} else if (isSuitableMatMulParent(node)) {
SetNodeFusingType(node, NodeFusingType::FusedWithMatMul);
continue;
} else if (isSuitableSubtractAsZeroPointsParent(node)) {
SetSnippetsNodeType(node, snippets::pass::SnippetsNodeType::SkippedByPlugin);
continue;
}
for (const auto fusingChainType : getContinuableChains(node)) {
if (isSuitableChildForFusingSimple(node, channelAxis)) {

View File

@ -22,6 +22,7 @@
#include <snippets/op/subgraph.hpp>
#include "emitters/cpu_generator.hpp"
#include "snippets_transformations/fuse_load_store_and_convert.hpp"
using namespace InferenceEngine;
using namespace dnnl::impl::utils;
@ -60,7 +61,7 @@ void Snippet::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;
const Precision supportedPrecision = Precision::FP32;
const std::set<Precision> supportedPrecisions = { Precision::FP32, Precision::I32, Precision::BF16, Precision::I8, Precision::U8 };
bool dimRanksAreEqual = true;
for (size_t i = 0; dimRanksAreEqual && i < inputShapes.size(); i++) {
@ -125,18 +126,29 @@ void Snippet::initSupportedPrimitiveDescriptors() {
config.dynBatchSupport = false;
config.inConfs.resize(inputShapes.size());
for (size_t i = 0; i < inputShapes.size(); i++) {
auto precision = getOriginalInputPrecisionAtPort(i);
if (supportedPrecisions.count(precision) == 0)
IE_THROW() << "Subgraph node with name `" << getName() << "` doesn't support " << precision << " precision.";
const auto equalPrecisions = getOriginalOutputPrecisions().size() == 1 &&
precision == getOriginalOutputPrecisionAtPort(0);
BlockedMemoryDesc::CmpMask inputMask = BLOCKED_DESC_SKIP_OFFSET_MASK;
PortConfig portConfig;
portConfig.inPlace((!i && canBeInPlace()) ? 0 : -1);
portConfig.inPlace((!i && canBeInPlace() && equalPrecisions) ? 0 : -1);
portConfig.constant(false);
if (inputShapes[i].getDims()[0] == 1) {
inputMask.reset(0); // accepts any stride on batch axis
}
portConfig.setMemDesc(createMemoryDesc(inputShapes[i], supportedPrecision, offset), inputMask);
portConfig.setMemDesc(createMemoryDesc(inputShapes[i], precision, offset), inputMask);
config.inConfs[i] = portConfig;
}
config.outConfs.resize(outputShapes.size());
for (size_t i = 0; i < outputShapes.size(); i++) {
auto precision = getOriginalOutputPrecisionAtPort(i);
if (supportedPrecisions.count(precision) == 0)
IE_THROW() << "Subgraph node with name `" << getName() << "` doesn't support " << precision << " precision.";
BlockedMemoryDesc::CmpMask outputMask = BLOCKED_DESC_SKIP_OFFSET_MASK;
PortConfig portConfig;
portConfig.inPlace(-1);
@ -144,7 +156,7 @@ void Snippet::initSupportedPrimitiveDescriptors() {
if (outputShapes[i].getDims()[0] == 1) {
outputMask.reset(0); // accepts any stride on batch axis
}
portConfig.setMemDesc(createMemoryDesc(outputShapes[i], supportedPrecision, offset), outputMask);
portConfig.setMemDesc(createMemoryDesc(outputShapes[i], precision, offset), outputMask);
config.outConfs[i] = portConfig;
}
@ -203,11 +215,27 @@ bool Snippet::created() const {
return getType() == Type::Subgraph;
}
InferenceEngine::Precision Snippet::getRuntimePrecision() const {
std::vector<InferenceEngine::Precision> inputPrecisions;
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
if (parentEdge && parentEdge->getStatus() == Edge::Status::Validated && !parentEdge->getParent()->isConstant()) {
inputPrecisions.emplace_back(DnnlExtensionUtils::DataTypeToIEPrecision((parentEdge->getMemoryPtr()->GetDataType())));
}
}
return getMaxPrecision(inputPrecisions);
}
bool Snippet::canBeInPlace() const {
if (getParentEdgesAtPort(0)[0]->getParent()->getType() == Type::Input) {
return false;
}
if (getChildEdges().size() != 1) {
return false;
}
for (auto& parentEdge : getParentEdges()) {
auto parent = parentEdge.lock()->getParent();
if (parent->getChildEdges().size() != 1)
@ -271,7 +299,9 @@ void Snippet::define_schedule() {
ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes;
for (size_t i = 0; i < outputShapes.size(); i++)
output_blocked_shapes.push_back(edgeToBlockedShape(getChildEdgesAtPort(i)[0]));
exec_domain = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes);
// initialize by maximum output dimension. Dimensions of outputs should be broadcastable
tensorRank = std::max(static_cast<size_t>(rank6D), exec_domain.size());
// Canonicalization broadcasts inputs and outputs to max input rank, which can be smaller than tensorRank
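    // Illustration (an assumption for clarity, not part of the patch): if
    // canonicalize() returns exec_domain == {2, 3, 8, 16} and rank6D == 6,
    // then tensorRank == 6 and the domain is expected to be padded with leading 1s up to rank 6.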
@ -287,8 +317,7 @@ void Snippet::define_schedule() {
}
const auto config = getSelectedPrimitiveDescriptor()->getConfig();
const auto dataSize = config.inConfs[0].getMemDesc()->getPrecision().size();
auto initOffsets = [this, config, dataSize]() {
auto initOffsets = [this, config]() {
        // find the max-rank input among all inputs
const size_t inputNum = getParentEdges().size();
offsets_in.resize(inputNum);
@ -296,7 +325,7 @@ void Snippet::define_schedule() {
offsets_in[i].resize(tensorRank, 1);
offset_calculation(offsets_in[i], dims_in[i], exec_domain);
for (size_t j = 0; j < tensorRank; j++) {
offsets_in[i][j] *= dataSize;
offsets_in[i][j] *= config.inConfs[i].getMemDesc()->getPrecision().size();
}
}
@ -305,7 +334,8 @@ void Snippet::define_schedule() {
for (size_t i = 0; i < inputNum; i++) {
const auto memPtr = getParentEdgeAt(i)->getMemoryPtr();
srcMemPtrs[i] = memPtr;
start_offset_in[i] = memPtr->GetDescWithType<BlockedMemoryDesc>()->getOffsetPadding() * dataSize;
start_offset_in[i] = memPtr->GetDescWithType<BlockedMemoryDesc>()->getOffsetPadding() *
config.inConfs[i].getMemDesc()->getPrecision().size();
}
const size_t outputNum = config.outConfs.size();
@ -314,7 +344,7 @@ void Snippet::define_schedule() {
offsets_out[i].resize(tensorRank, 1);
offset_calculation(offsets_out[i], dims_out[i], exec_domain);
for (size_t j = 0; j < tensorRank; j++) {
offsets_out[i][j] *= dataSize;
offsets_out[i][j] *= config.outConfs[i].getMemDesc()->getPrecision().size();
}
}
@ -323,7 +353,8 @@ void Snippet::define_schedule() {
for (size_t i = 0; i < outputNum; i++) {
const auto memPtr = getChildEdgeAt(i)->getMemoryPtr();
dstMemPtrs[i] = memPtr;
start_offset_out[i] = memPtr->GetDescWithType<BlockedMemoryDesc>()->getOffsetPadding() * dataSize;
start_offset_out[i] = memPtr->GetDescWithType<BlockedMemoryDesc>()->getOffsetPadding() *
config.outConfs[i].getMemDesc()->getPrecision().size();
}
};
@ -373,7 +404,7 @@ void Snippet::define_schedule() {
return collapsedDims;
};
auto initSchedulingInfo = [this, dataSize]() -> void {
auto initSchedulingInfo = [this, config]() -> void {
// initialize scheduling information
sch_offsets_in.resize(offsets_in.size(), 0);
sch_offsets_out.resize(offsets_out.size(), 0);
@ -385,19 +416,38 @@ void Snippet::define_schedule() {
schedulerWorkAmount /= exec_domain[tensorRank - 2];
exec_domain[tensorRank - 2] = 1;
        // update offsets for tile 2D because loaders have ptr shifts in some cases and stores always have ptr shifts
// update offsets for tile 2D because loaders and stores have ptr shifts in some cases
const int64_t vector_size = snippet->get_generator()->get_target_machine()->get_lanes();
for (size_t i = 0; i < offsets_in.size(); i++) {
int64_t offset = offsets_in[i][tensorRank - 2];
if ((offset > dataSize) || (offset == 0 && dims_in[i].back() != 1)) {
sch_offsets_in[i] = offset - exec_domain.back() * dataSize;
} else if (offset == dataSize) {
const int64_t offset = offsets_in[i][tensorRank - 2];
const int64_t data_size = config.inConfs[i].getMemDesc()->getPrecision().size();
if (offset == data_size || offset == vector_size * data_size) {
sch_offsets_in[i] = offset;
} else if ((offset > data_size) || (offset == 0 && dims_in[i].back() != 1 && dims_in[i].back() != vector_size)) {
sch_offsets_in[i] = offset - exec_domain.back() * data_size;
                // If the scalar tile is executed only once, the pointer isn't advanced inside it,
                // so the offset must be corrected by one element (data_size)
if (exec_domain.back() % vector_size == 1) {
sch_offsets_in[i] += data_size;
}
}
}
for (size_t i = 0; i < offsets_out.size(); i++) {
int64_t offset = offsets_out[i][tensorRank - 2];
sch_offsets_out[i] = offset - exec_domain.back() * dataSize;
const int64_t offset = offsets_out[i][tensorRank - 2];
const size_t data_size = config.outConfs[i].getMemDesc()->getPrecision().size();
if (offset == data_size || offset == vector_size * data_size) {
sch_offsets_out[i] = offset;
} else if ((offset > data_size) || (offset == 0 && dims_out[i].back() != 1 && dims_out[i].back() != vector_size)) {
sch_offsets_out[i] = offset - exec_domain.back() * data_size;
                // If the scalar tile is executed only once, the pointer isn't advanced inside it,
                // so the offset must be corrected by one element (data_size)
if (exec_domain.back() % vector_size == 1) {
sch_offsets_out[i] += data_size;
}
}
}
}
};
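    // Worked example (illustration only, not part of the patch): take an f32 input
    // (data_size == 4), vector_size == 8 and a dense innermost row of 17 elements,
    // so offset == 17 * 4 == 68. Since 68 > data_size, the branch above yields
    // sch_offsets_in[i] == 68 - 17 * 4 == 0; and since 17 % 8 == 1 the scalar tile
    // runs exactly once, so data_size is added back: sch_offsets_in[i] == 4.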
@ -434,7 +484,28 @@ void Snippet::generate() {
auto b = offsets_out[i].begin();
std::copy(b, b + harness_num_dims, &jcp.data_offsets[(inputShapes.size() + i) * harness_num_dims]);
}
schedule = snippet->generate(reinterpret_cast<void*>(&jcp));
ov::pass::Manager optManager;
optManager.register_pass<ov::intel_cpu::pass::FuseLoadConvert>();
optManager.register_pass<ov::intel_cpu::pass::FuseStoreConvert>();
    // LoadConvert uses a Load emitter that supports conversion from any type only to f32
optManager.get_pass_config()->set_callback<ov::intel_cpu::pass::FuseLoadConvert>(
[](const std::shared_ptr<const ov::Node>& n) -> bool {
if (const auto& convert = std::dynamic_pointer_cast<const ov::op::v0::Convert>(n))
return convert->get_destination_type() != ov::element::f32;
return true;
});
    // StoreConvert uses a Store emitter that supports conversion only from f32 to any type
optManager.get_pass_config()->set_callback<ov::intel_cpu::pass::FuseStoreConvert>(
[](const std::shared_ptr<const ov::Node>& n) -> bool {
if (const auto& convert = std::dynamic_pointer_cast<const ov::op::v0::Convert>(n))
return convert->get_input_element_type(0) != ov::element::f32;
return true;
});
schedule = snippet->generate(optManager, reinterpret_cast<void*>(&jcp));
}
void Snippet::schedule_6d(const jit_snippets_call_args& call_args) const {

View File

@ -30,6 +30,7 @@ public:
void getSupportedDescriptors() override {};
void initSupportedPrimitiveDescriptors() override;
void selectOptimalPrimitiveDescriptor() override;
InferenceEngine::Precision getRuntimePrecision() const override;
// Here we convert to canonical for & jit everything
void createPrimitive() override;

View File

@ -0,0 +1,117 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/itt.hpp"
#include "fuse_load_store_and_convert.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets_transformations/op/load_convert.hpp"
#include "snippets_transformations/op/store_convert.hpp"
#include "ngraph/opsets/opset1.hpp"
#include "ngraph/rt_info.hpp"
#include "ngraph/pattern/op/wrap_type.hpp"
ov::intel_cpu::pass::FuseLoadConvert::FuseLoadConvert() {
MATCHER_SCOPE(FuseLoadConvert);
auto param_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Parameter>();
auto load_pattern = ngraph::pattern::wrap_type<ngraph::snippets::op::Load>({param_pattern});
auto convert_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Convert>({load_pattern});
auto callback = [=](ngraph::pattern::Matcher& m) {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::FuseLoadConvert")
auto& pm = m.get_pattern_value_map();
const auto param = pm.at(param_pattern).get_node_shared_ptr();
const auto load_shared = pm.at(load_pattern).get_node_shared_ptr();
if (!load_shared || load_shared->output(0).get_target_inputs().size() != 1) {
return false;
}
const auto load = std::dynamic_pointer_cast<ngraph::snippets::op::Load>(load_shared);
if (!load)
return false;
const auto convert = pm.at(convert_pattern).get_node_shared_ptr();
if (transformation_callback(convert))
return false;
std::shared_ptr<ngraph::Node> load_convert = nullptr;
if (const auto convert_saturation =
std::dynamic_pointer_cast<ngraph::snippets::op::ConvertSaturation>(convert)) {
load_convert = std::make_shared<ov::intel_cpu::LoadConvertSaturation>(param,
convert_saturation->get_destination_type(),
load->get_count());
} else if (const auto convert_truncation =
std::dynamic_pointer_cast<ngraph::snippets::op::ConvertTruncation>(convert)) {
load_convert = std::make_shared<ov::intel_cpu::LoadConvertTruncation>(param,
convert_truncation->get_destination_type(),
load->get_count());
} else {
throw ngraph::ngraph_error(
"Type of Convert op is undefined. Supports only fusing Load and ConvertTruncation or ConvertSaturation ops");
}
if (!load_convert)
return false;
ngraph::copy_runtime_info(convert, load_convert);
ngraph::replace_node(convert, load_convert);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(convert_pattern, matcher_name);
register_matcher(m, callback);
}
ov::intel_cpu::pass::FuseStoreConvert::FuseStoreConvert() {
MATCHER_SCOPE(FuseStoreConvert);
auto input_pattern = ngraph::pattern::any_input();
auto convert_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Convert>({input_pattern});
auto store_pattern = ngraph::pattern::wrap_type<ngraph::snippets::op::Store>({convert_pattern});
auto callback = [=](ngraph::pattern::Matcher& m) {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::FuseStoreConvert")
auto& pm = m.get_pattern_value_map();
const auto input = pm.at(input_pattern).get_node_shared_ptr();
const auto store = std::dynamic_pointer_cast<ngraph::snippets::op::Store>(pm.at(store_pattern).get_node_shared_ptr());
if (!store)
return false;
const auto convert = pm.at(convert_pattern).get_node_shared_ptr();
if (convert->output(0).get_target_inputs().size() != 1 || transformation_callback(convert))
return false;
std::shared_ptr<ngraph::Node> store_convert = nullptr;
if (const auto convert_saturation =
std::dynamic_pointer_cast<ngraph::snippets::op::ConvertSaturation>(convert)) {
store_convert = std::make_shared<ov::intel_cpu::StoreConvertSaturation>(input,
convert_saturation->get_destination_type(),
store->get_count());
} else if (const auto convert_truncation =
std::dynamic_pointer_cast<ngraph::snippets::op::ConvertTruncation>(convert)) {
store_convert = std::make_shared<ov::intel_cpu::StoreConvertTruncation>(input,
convert_truncation->get_destination_type(),
store->get_count());
} else {
throw ngraph::ngraph_error(
"Type of Convert op is undefined. Supports only fusing Store and ConvertTruncation or ConvertSaturation ops");
}
if (!store_convert)
return false;
ngraph::copy_runtime_info(store, store_convert);
ngraph::replace_node(store, store_convert);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(store_pattern, matcher_name);
register_matcher(m, callback);
}

View File

@ -0,0 +1,40 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "ngraph/pass/graph_rewrite.hpp"
#include "ngraph/pattern/matcher.hpp"
namespace ov {
namespace intel_cpu {
namespace pass {
/**
* @interface FuseLoadConvert
* @brief Fuse Load and ConvertSaturation into one op LoadConvertSaturation
* Fuse Load and ConvertTruncation into one op LoadConvertTruncation
* @ingroup snippets
*/
class FuseLoadConvert: public ngraph::pass::MatcherPass {
public:
OPENVINO_RTTI("FuseLoadConvert", "0");
FuseLoadConvert();
};
/**
* @interface FuseStoreConvert
* @brief Fuse Store and ConvertSaturation into one op StoreConvertSaturation
* Fuse Store and ConvertTruncation into one op StoreConvertTruncation
* @ingroup snippets
*/
class FuseStoreConvert: public ngraph::pass::MatcherPass {
public:
OPENVINO_RTTI("FuseStoreConvert", "0");
FuseStoreConvert();
};
} // namespace pass
} // namespace intel_cpu
} // namespace ov
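// A minimal registration sketch (an illustration; `model` is assumed to be a
// lowered snippets body, mirroring how the plugin's Snippet::generate() wires
// these passes up):
//
//     ov::pass::Manager manager;
//     manager.register_pass<ov::intel_cpu::pass::FuseLoadConvert>();
//     manager.register_pass<ov::intel_cpu::pass::FuseStoreConvert>();
//     manager.run_passes(model);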

View File

@ -0,0 +1,56 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/itt.hpp"
#include "load_convert.hpp"
#include "ngraph/runtime/host_tensor.hpp"
using namespace std;
using namespace ov;
intel_cpu::LoadConvertSaturation::LoadConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count) :
Load(x, count), m_destination_type(destination_type) {
constructor_validate_and_infer_types();
}
bool intel_cpu::LoadConvertSaturation::visit_attributes(AttributeVisitor& visitor) {
INTERNAL_OP_SCOPE(LoadConvert_visit_attributes);
visitor.on_attribute("destination_type", m_destination_type);
return true;
}
void intel_cpu::LoadConvertSaturation::validate_and_infer_types() {
INTERNAL_OP_SCOPE(LoadConvert_validate_and_infer_types);
set_output_type(0, m_destination_type, get_input_partial_shape(0));
}
std::shared_ptr<Node> intel_cpu::LoadConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs);
check_new_args_count(this, new_args);
return std::make_shared<LoadConvertSaturation>(new_args.at(0), m_destination_type, m_count);
}
intel_cpu::LoadConvertTruncation::LoadConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count) :
Load(x, count), m_destination_type(destination_type) {
constructor_validate_and_infer_types();
}
bool intel_cpu::LoadConvertTruncation::visit_attributes(AttributeVisitor& visitor) {
INTERNAL_OP_SCOPE(LoadConvert_visit_attributes);
visitor.on_attribute("destination_type", m_destination_type);
return true;
}
void intel_cpu::LoadConvertTruncation::validate_and_infer_types() {
INTERNAL_OP_SCOPE(LoadConvert_validate_and_infer_types);
set_output_type(0, m_destination_type, get_input_partial_shape(0));
}
std::shared_ptr<Node> intel_cpu::LoadConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs);
check_new_args_count(this, new_args);
return std::make_shared<LoadConvertTruncation>(new_args.at(0), m_destination_type, m_count);
}

View File

@ -0,0 +1,68 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "ngraph/op/op.hpp"
#include "snippets/op/load.hpp"
namespace ov {
namespace intel_cpu {
/**
* @interface LoadConvertSaturation
* @brief Fused operation to represent computations equal to consecutive Load and ConvertSaturation operations.
* The operation is used for peephole optimization during subgraph lowering.
* @ingroup snippets
*/
class LoadConvertSaturation : public ngraph::snippets::op::Load {
public:
OPENVINO_OP("LoadConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Load);
LoadConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count = 1lu);
LoadConvertSaturation() = default;
ov::element::Type get_destination_type() const { return m_destination_type; }
bool visit_attributes(AttributeVisitor& visitor) override;
void validate_and_infer_types() override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
bool has_evaluate() const override { return false; }
protected:
ov::element::Type m_destination_type;
};
/**
* @interface LoadConvertTruncation
* @brief Fused operation to represent computations equal to consecutive Load and ConvertTruncation operations.
* The operation is used for peephole optimization during subgraph lowering.
* @ingroup snippets
*/
class LoadConvertTruncation : public ngraph::snippets::op::Load {
public:
OPENVINO_OP("LoadConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Load);
LoadConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count = 1lu);
LoadConvertTruncation() = default;
ov::element::Type get_destination_type() const { return m_destination_type; }
bool visit_attributes(AttributeVisitor& visitor) override;
void validate_and_infer_types() override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
bool has_evaluate() const override { return false; }
protected:
ov::element::Type m_destination_type;
};
} // namespace intel_cpu
} // namespace ov
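// A minimal construction sketch (hypothetical shape/types for illustration;
// the argument list matches the FuseLoadConvert pass above, and the Load
// emitter only supports conversion to f32):
//
//     auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::i8, ov::Shape{1, 16});
//     auto load_convert = std::make_shared<ov::intel_cpu::LoadConvertSaturation>(
//             param, ov::element::f32, /*count =*/ 1);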

View File

@ -0,0 +1,56 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/itt.hpp"
#include "store_convert.hpp"
#include "ngraph/runtime/host_tensor.hpp"
using namespace std;
using namespace ov;
intel_cpu::StoreConvertSaturation::StoreConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count) :
Store(x, count), m_destination_type(destination_type) {
constructor_validate_and_infer_types();
}
bool intel_cpu::StoreConvertSaturation::visit_attributes(AttributeVisitor& visitor) {
INTERNAL_OP_SCOPE(StoreConvert_visit_attributes);
visitor.on_attribute("destination_type", m_destination_type);
return true;
}
void intel_cpu::StoreConvertSaturation::validate_and_infer_types() {
INTERNAL_OP_SCOPE(StoreConvert_validate_and_infer_types);
set_output_type(0, m_destination_type, get_input_partial_shape(0));
}
std::shared_ptr<Node> intel_cpu::StoreConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs);
check_new_args_count(this, new_args);
return std::make_shared<StoreConvertSaturation>(new_args.at(0), m_destination_type, m_count);
}
intel_cpu::StoreConvertTruncation::StoreConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count) :
Store(x, count), m_destination_type(destination_type) {
constructor_validate_and_infer_types();
}
bool intel_cpu::StoreConvertTruncation::visit_attributes(AttributeVisitor& visitor) {
INTERNAL_OP_SCOPE(StoreConvert_visit_attributes);
visitor.on_attribute("destination_type", m_destination_type);
return true;
}
void intel_cpu::StoreConvertTruncation::validate_and_infer_types() {
INTERNAL_OP_SCOPE(StoreConvert_validate_and_infer_types);
set_output_type(0, m_destination_type, get_input_partial_shape(0));
}
std::shared_ptr<Node> intel_cpu::StoreConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs);
check_new_args_count(this, new_args);
return std::make_shared<StoreConvertTruncation>(new_args.at(0), m_destination_type, m_count);
}

View File

@ -0,0 +1,68 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "ngraph/op/op.hpp"
#include "snippets/op/store.hpp"
namespace ov {
namespace intel_cpu {
/**
* @interface StoreConvertSaturation
* @brief Fused operation to represent computations equal to consecutive Store and ConvertSaturation operations.
* The operation is used for peephole optimization during subgraph lowering.
* @ingroup snippets
*/
class StoreConvertSaturation : public ngraph::snippets::op::Store {
public:
OPENVINO_OP("StoreConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Store);
StoreConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count = 1lu);
StoreConvertSaturation() = default;
ov::element::Type get_destination_type() const { return m_destination_type; }
bool visit_attributes(AttributeVisitor& visitor) override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
void validate_and_infer_types() override;
bool has_evaluate() const override { return false; }
protected:
ov::element::Type m_destination_type;
};
/**
* @interface StoreConvertTruncation
* @brief Fused operation to represent computations equal to consecutive Store and ConvertTruncation operations.
* The operation is used for peephole optimization during subgraph lowering.
* @ingroup snippets
*/
class StoreConvertTruncation : public ngraph::snippets::op::Store {
public:
OPENVINO_OP("StoreConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Store);
StoreConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count = 1lu);
StoreConvertTruncation() = default;
ov::element::Type get_destination_type() const { return m_destination_type; }
bool visit_attributes(AttributeVisitor& visitor) override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
void validate_and_infer_types() override;
bool has_evaluate() const override { return false; }
protected:
ov::element::Type m_destination_type;
};
} // namespace intel_cpu
} // namespace ov
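// A minimal construction sketch (hypothetical values; per the plugin callbacks,
// StoreConvert is only fused when the value being stored is f32):
//
//     // `result_value` stands for any f32-producing node inside the subgraph body
//     auto store_convert = std::make_shared<ov::intel_cpu::StoreConvertTruncation>(
//             result_value, ov::element::i8, /*count =*/ 1);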

View File

@ -12,23 +12,31 @@ namespace snippets {
namespace {
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add,
::testing::Combine(
::testing::Values(ov::Shape {1, 42, 16, 64}),
::testing::Values(ov::Shape {1, 42, 16, 1}),
::testing::Values(1), // one node - Add
::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs
::testing::Values(CommonTestUtils::DEVICE_CPU)),
Add::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinh,
::testing::Combine(
::testing::Values(ov::Shape {1, 42, 16, 64}),
::testing::Values(ov::Shape {1, 42, 16, 1}),
                        ::testing::Values(3), // Subgraph + 2 Sinh after inputs
                        ::testing::Values(1), // Subgraph is created, since the inputs are followed by Sinh
::testing::Values(CommonTestUtils::DEVICE_CPU)),
AddSinh::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinhConst,
::testing::Combine(
::testing::Values(ov::Shape {1, 42, 16, 64}),
                     ::testing::Values(2), // Sinh after input + Add subgraph
                     ::testing::Values(1), // Subgraph is created, since the input is followed by Sinh
::testing::Values(CommonTestUtils::DEVICE_CPU)),
AddSinhConst::getTestCaseName);
} // namespace
} // namespace snippets

View File

@ -0,0 +1,162 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/convert.hpp"
#include "common_test_utils/test_constants.hpp"
namespace ov {
namespace test {
namespace snippets {
namespace {
const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_Convert = {
{ { ov::element::f32 }, { ov::element::i32 } },
{ { ov::element::f32 }, { ov::element::bf16 } },
{ { ov::element::f32 }, { ov::element::u8 } },
{ { ov::element::f32 }, { ov::element::i8 } },
{ { ov::element::bf16 }, { ov::element::f32 } },
{ { ov::element::bf16 }, { ov::element::i32 } },
{ { ov::element::bf16 }, { ov::element::i8 } },
{ { ov::element::bf16 }, { ov::element::u8 } },
{ { ov::element::i8 }, { ov::element::f32 } },
{ { ov::element::i8 }, { ov::element::i32 } },
{ { ov::element::i8 }, { ov::element::bf16 } },
{ { ov::element::i8 }, { ov::element::u8 } },
{ { ov::element::u8 }, { ov::element::f32 } },
{ { ov::element::u8 }, { ov::element::i32 } },
{ { ov::element::u8 }, { ov::element::bf16 } },
{ { ov::element::u8 }, { ov::element::i8 } },
};
const std::vector<std::vector<ov::Shape>> inputShapes_Convert = {
{ ov::Shape{2, 16} },
{ ov::Shape{5, 5} },
{ ov::Shape{2, 12, 1} }
};
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Convert, Convert,
::testing::Combine(
::testing::ValuesIn(inputShapes_Convert),
::testing::ValuesIn(types_Convert),
::testing::Values(2),
::testing::Values(1),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
Convert::getTestCaseName);
const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertInput = {
{ { ov::element::f32 }, { ov::element::i32 } },
{ { ov::element::f32 }, { ov::element::bf16 } },
{ { ov::element::bf16 }, { ov::element::f32 } },
{ { ov::element::i8 }, { ov::element::f32 } },
{ { ov::element::i8 }, { ov::element::i32 } },
{ { ov::element::i8 }, { ov::element::bf16 } },
{ { ov::element::u8 }, { ov::element::f32 } },
{ { ov::element::u8 }, { ov::element::i32 } },
{ { ov::element::u8 }, { ov::element::bf16 } },
};
const std::vector<std::vector<ov::Shape>> inputShapes_ConvertInput = {
{ ov::Shape{2, 16}, ov::Shape{1, 16} },
{ ov::Shape{5, 18}, ov::Shape{5, 1} },
{ ov::Shape{3, 1}, ov::Shape{3, 21} }
};
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertInput, ConvertInput,
::testing::Combine(
::testing::ValuesIn(inputShapes_ConvertInput),
::testing::ValuesIn(types_ConvertInput),
::testing::Values(3),
::testing::Values(1),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
Convert::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertOutput, ConvertOutput,
::testing::Combine(
::testing::ValuesIn(inputShapes_ConvertInput),
::testing::ValuesIn(types_ConvertInput),
::testing::Values(3),
::testing::Values(1),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
Convert::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertStub, ConvertStub,
::testing::Combine(
::testing::ValuesIn(inputShapes_ConvertInput),
::testing::ValuesIn(types_ConvertInput),
::testing::Values(4),
::testing::Values(2),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
Convert::getTestCaseName);
const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertPartialInputsAndResults = {
{ { ov::element::i8, ov::element::i32, ov::element::f32 }, { ov::element::f32, ov::element::i8 } },
{ { ov::element::bf16, ov::element::u8, ov::element::i32 }, { ov::element::i32, ov::element::bf16 } },
};
const std::vector<std::vector<ov::Shape>> inputShapes_ConvertPartialInputsAndResults = {
{ ov::Shape{2, 16}, ov::Shape{1, 16}, ov::Shape{1, 1} },
{ ov::Shape{5, 18}, ov::Shape{5, 1}, ov::Shape{1, 18} },
{ ov::Shape{3, 1}, ov::Shape{3, 21}, ov::Shape{3, 1} }
};
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertPartialInputsAndResults, ConvertPartialInputsAndResults,
::testing::Combine(
::testing::ValuesIn(inputShapes_ConvertPartialInputsAndResults),
::testing::ValuesIn(types_ConvertPartialInputsAndResults),
::testing::Values(6),
::testing::Values(1),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
Convert::getTestCaseName);
const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertMany = {
{ { ov::element::i32, ov::element::u8}, {} },
{ { ov::element::i32, ov::element::u8, ov::element::i32 }, {} },
{ { ov::element::i32, ov::element::f32, ov::element::i32, ov::element::i8 }, {} },
{ { ov::element::i32, ov::element::i8, ov::element::i32, ov::element::f32 }, {} },
};
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnInputs, ConvertManyOnInputs,
::testing::Combine(
::testing::Values(std::vector<ov::Shape>{ov::Shape{5, 5, 5, 5}}),
::testing::ValuesIn(types_ConvertMany),
::testing::Values(2),
::testing::Values(1),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
Convert::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnOutputs, ConvertManyOnOutputs,
::testing::Combine(
::testing::Values(std::vector<ov::Shape>{ov::Shape{5, 5, 5, 5}}),
::testing::ValuesIn(types_ConvertMany),
::testing::Values(5), // sinh + subgraph + reorders for sinh
::testing::Values(1),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
Convert::getTestCaseName);
const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertManyIO = {
{ { ov::element::i32, ov::element::u8}, {ov::element::i32} },
{ { ov::element::i32, ov::element::u8, ov::element::i32 }, { ov::element::i32, ov::element::i8, ov::element::i32, ov::element::f32 } },
};
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnInputOutput, ConvertManyOnInputOutput,
::testing::Combine(
::testing::Values(std::vector<ov::Shape>{ov::Shape{5, 5, 5, 5}}),
::testing::ValuesIn(types_ConvertManyIO),
::testing::Values(5), // sinh + subgraph + reorders for sinh
::testing::Values(1),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
Convert::getTestCaseName);
} // namespace
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,25 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/eltwise_two_results.hpp"
#include "common_test_utils/test_constants.hpp"
namespace ov {
namespace test {
namespace snippets {
namespace {
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, EltwiseTwoResults,
::testing::Combine(
::testing::Values(ov::Shape {1, 64, 10, 10}),
::testing::Values(ov::Shape {1, 64, 10, 1}),
::testing::Values(4),
::testing::Values(2),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
EltwiseTwoResults::getTestCaseName);
} // namespace
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,26 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/max_num_params_eltwise.hpp"
#include "common_test_utils/test_constants.hpp"
namespace ov {
namespace test {
namespace snippets {
namespace {
// Note that we need these shapes to cover all cases of code emission (none/one/multiple of scalar/vector tiles)
std::vector<ov::Shape> input_shapes {{1, 64, 10, 10}, {1, 1, 17, 37}, {1, 1, 1, 1}, {1, 1, 1, 7},
{1, 1, 1, 128}, {1, 1, 1, 14}, {1, 1, 1, 16}, {1, 1, 1, 30}};
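// For example, assuming a vector length of 16 f32 lanes (an AVX-512 assumption):
// {1, 1, 1, 1}  -> no vector tile, one scalar iteration
// {1, 1, 1, 16} -> one vector tile, no scalar tile
// {1, 1, 1, 30} -> one vector tile + 14 scalar iterations
// {1, 1, 17, 37} -> two vector tiles + 5 scalar iterations per row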
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, MaxNumParamsEltwiseSinh,
::testing::Combine(
::testing::ValuesIn(input_shapes),
::testing::Values(12), // 10 Sinh after inputs + Subgraph + Concat
::testing::Values(1),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
MaxNumParamsEltwiseSinh::getTestCaseName);
} // namespace
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -10,25 +10,25 @@ namespace test {
namespace snippets {
namespace {
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwise,
::testing::Combine(
::testing::Values(ov::Shape {1, 64, 10, 10}),
::testing::Values(ov::Shape {1, 64, 10, 1}),
::testing::Values(ov::Shape {1, 1, 1, 10}),
::testing::Values(2), // eltwises fuse only for non-broadcasted shapes
::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs
::testing::Values(CommonTestUtils::DEVICE_CPU)),
ThreeInputsEltwise::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwiseSinh,
::testing::Combine(
::testing::Values(ov::Shape {1, 64, 10, 10}),
::testing::Values(ov::Shape {1, 64, 10, 1}),
::testing::Values(ov::Shape {1, 1, 1, 10}),
                         ::testing::Values(4), // Subgraph + 3 Sinh after inputs
                         ::testing::Values(1), // Subgraph is created, since the inputs are followed by Sinh
::testing::Values(CommonTestUtils::DEVICE_CPU)),
ThreeInputsEltwiseSinh::getTestCaseName);
} // namespace
} // namespace snippets

View File

@ -0,0 +1,45 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/two_inputs_and_outputs.hpp"
#include "common_test_utils/test_constants.hpp"
namespace ov {
namespace test {
namespace snippets {
namespace {
const std::vector<std::vector<ov::Shape>> input_shapes = {
{ {5, 5, 256, 1}, {5, 5, 256, 1} },
{ {5, 5, 16, 35}, {5, 5, 16, 35} },
{ {5, 5, 256, 1}, {5, 5, 256, 35} },
{ {5, 5, 256, 1}, {5, 5, 1, 1} },
{ {5, 5, 16, 35}, {5, 5, 1, 1} },
{ {5, 5, 16, 35}, {5, 5, 16, 1} },
{ {5, 5, 5, 35}, {5, 5, 1, 35} },
{ {5, 5, 16, 1}, {5, 5, 1, 35} },
{ {5, 5, 35, 16}, {5, 5, 35, 16} },
{ {5, 5, 35, 16}, {5, 5, 1, 16} },
{ {5, 5, 35, 17}, {5, 5, 35, 17} },
{ {5, 5, 35, 17}, {5, 5, 1, 17} },
{ {5, 5, 35, 18}, {5, 5, 35, 18} },
{ {5, 5, 35, 18}, {5, 5, 1, 18} },
};
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, TwoInputsAndOutputs,
::testing::Combine(
::testing::ValuesIn(input_shapes),
::testing::Values(4),
::testing::Values(1),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
TwoInputsAndOutputs::getTestCaseName);
} // namespace
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -731,4 +731,4 @@ const auto params_5D_dyn_param = ::testing::Combine(
INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_MemOrder_dyn_param, EltwiseLayerCPUTest, params_5D_dyn_param, EltwiseLayerCPUTest::getTestCaseName);
} // namespace
} // namespace CPULayerTestsDefinitions

@ -1 +1 @@
Subproject commit 2a749c577f8a841a396d4bd46eaf311b7e7dc089
Subproject commit f9e363fc1ff47191c7ddea63b19c7893965a786a

View File

@ -18,6 +18,13 @@ typedef std::tuple<
std::string // Target Device
> AddParams;
typedef std::tuple<
ov::Shape, // Input 0 Shape
size_t, // Expected num nodes
size_t, // Expected num subgraphs
std::string // Target Device
> AddConstParams;
class Add : public testing::WithParamInterface<ov::test::snippets::AddParams>,
virtual public ov::test::SnippetsTestsCommon {
public:
@ -32,6 +39,14 @@ protected:
void SetUp() override;
};
class AddSinhConst : public testing::WithParamInterface<ov::test::snippets::AddConstParams>,
virtual public ov::test::SnippetsTestsCommon {
public:
static std::string getTestCaseName(testing::TestParamInfo<ov::test::snippets::AddConstParams> obj);
protected:
void SetUp() override;
};
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,76 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "shared_test_classes/base/snippets_test_utils.hpp"
namespace ov {
namespace test {
namespace snippets {
typedef std::tuple<
std::vector<ov::Shape>, // InputShapes
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>, // Input and Output data types for Converts
size_t, // Expected num nodes
size_t, // Expected num subgraphs
std::string // Target Device
> ConvertParams;
using parameters = std::vector<std::tuple<int32_t, int32_t, int32_t>>;
class Convert : public testing::WithParamInterface<ov::test::snippets::ConvertParams>,
virtual public ov::test::SnippetsTestsCommon {
public:
static std::string getTestCaseName(testing::TestParamInfo<ov::test::snippets::ConvertParams> obj);
protected:
void SetUp() override;
void generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) override;
virtual parameters generate_params_random() const;
ov::element::Type output_type = ov::element::f32;
};
class ConvertInput : public Convert {
protected:
void SetUp() override;
parameters generate_params_random() const override;
};
class ConvertOutput : public ConvertInput {
protected:
void SetUp() override;
};
class ConvertStub : public ConvertInput {
protected:
void SetUp() override;
};
class ConvertPartialInputsAndResults : public ConvertInput {
protected:
void SetUp() override;
};
class ConvertManyOnInputs : public ConvertInput {
protected:
void SetUp() override;
};
class ConvertManyOnOutputs : public ConvertInput {
protected:
void SetUp() override;
};
class ConvertManyOnInputOutput : public ConvertInput {
protected:
void SetUp() override;
};
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,33 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "shared_test_classes/base/snippets_test_utils.hpp"
namespace ov {
namespace test {
namespace snippets {
typedef std::tuple<
ov::Shape, // Input 0 Shape
ov::Shape, // Input 1 Shape
size_t, // Expected num nodes
size_t, // Expected num subgraphs
std::string // Target Device
> EltwiseTwoResultsParams;
class EltwiseTwoResults : public testing::WithParamInterface<ov::test::snippets::EltwiseTwoResultsParams>,
virtual public ov::test::SnippetsTestsCommon {
public:
static std::string getTestCaseName(testing::TestParamInfo<ov::test::snippets::EltwiseTwoResultsParams> obj);
protected:
void SetUp() override;
};
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,31 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "shared_test_classes/base/snippets_test_utils.hpp"
namespace ov {
namespace test {
namespace snippets {
typedef std::tuple<
        ov::Shape, // Input Shape (replicated for all inputs)
size_t, // Expected num nodes
size_t, // Expected num subgraphs
std::string // Target Device
> MaxNumParamsEltwiseParams;
class MaxNumParamsEltwiseSinh : public testing::WithParamInterface<ov::test::snippets::MaxNumParamsEltwiseParams>,
virtual public ov::test::SnippetsTestsCommon {
public:
static std::string getTestCaseName(testing::TestParamInfo<ov::test::snippets::MaxNumParamsEltwiseParams> obj);
protected:
void SetUp() override;
};
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,31 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "shared_test_classes/base/snippets_test_utils.hpp"
namespace ov {
namespace test {
namespace snippets {
typedef std::tuple<
        std::vector<ov::Shape>, // Input Shapes
size_t, // Expected num nodes
size_t, // Expected num subgraphs
std::string // Target Device
> TwoInputsAndOutputsParams;
class TwoInputsAndOutputs : public testing::WithParamInterface<ov::test::snippets::TwoInputsAndOutputsParams>,
virtual public ov::test::SnippetsTestsCommon {
public:
static std::string getTestCaseName(testing::TestParamInfo<ov::test::snippets::TwoInputsAndOutputsParams> obj);
protected:
void SetUp() override;
};
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -10,38 +10,61 @@ namespace ov {
namespace test {
namespace snippets {
std::string Add::getTestCaseName(testing::TestParamInfo<ov::test::snippets::AddParams> obj) {
ov::Shape inputShapes0, inputShapes1, newInputShapes;
std::string targetDevice;
size_t num_nodes, num_subgraphs;
std::tie(inputShapes0, inputShapes1, num_nodes, num_subgraphs, targetDevice) = obj.param;
std::ostringstream result;
result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_";
result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_";
result << "#N=" << num_nodes << "_";
result << "#S=" << num_subgraphs << "_";
result << "targetDevice=" << targetDevice;
return result.str();
}
void Add::SetUp() {
ov::Shape inputShape0, inputShape1;
std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}});
auto f = ov::test::snippets::AddFunction({inputShape0, inputShape1});
function = f.getOriginal();
}
void AddSinh::SetUp() {
ov::Shape inputShape0, inputShape1;
std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}});
auto f = ov::test::snippets::AddSinhFunction({inputShape0, inputShape1});
function = f.getOriginal();
}
std::string AddSinhConst::getTestCaseName(testing::TestParamInfo<ov::test::snippets::AddConstParams> obj) {
ov::Shape inputShapes, newInputShapes;
std::string targetDevice;
size_t num_nodes, num_subgraphs;
std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param;
std::ostringstream result;
result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes) << "_";
result << "#N=" << num_nodes << "_";
result << "#S=" << num_subgraphs << "_";
result << "targetDevice=" << targetDevice;
return result.str();
}
void AddSinhConst::SetUp() {
ov::Shape inputShape;
std::tie(inputShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
init_input_shapes({{{}, {inputShape, }}});
auto f = ov::test::snippets::AddSinhConstFunction({inputShape});
function = f.getOriginal();
}
TEST_P(Add, CompareWithRefImpl) {
run();
@ -53,6 +76,11 @@ TEST_P(AddSinh, CompareWithRefImpl) {
validateNumSubgraphs();
}
TEST_P(AddSinhConst, CompareWithRefImpl) {
run();
validateNumSubgraphs();
}
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,231 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "common_test_utils/common_utils.hpp"
#include "snippets/convert.hpp"
#include "subgraph_converts.hpp"
#include "common_test_utils/ov_tensor_utils.hpp"
namespace ov {
namespace test {
namespace snippets {
std::string Convert::getTestCaseName(testing::TestParamInfo<ov::test::snippets::ConvertParams> obj) {
std::vector<ov::Shape> inputShape;
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
std::string targetDevice;
size_t num_nodes, num_subgraphs;
std::tie(inputShape, types, num_nodes, num_subgraphs, targetDevice) = obj.param;
std::ostringstream result;
result << "IS=";
for (const auto& sh : inputShape)
result << CommonTestUtils::vec2str(sh) << "_";
result << "IT=" << CommonTestUtils::vec2str(types.first) << "_";
result << "OT=" << CommonTestUtils::vec2str(types.second) << "_";
result << "#N=" << num_nodes << "_";
result << "#S=" << num_subgraphs << "_";
result << "targetDevice=" << targetDevice;
return result.str();
}
void Convert::SetUp() {
std::vector<ov::Shape> inputShape;
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
init_input_shapes(static_shapes_to_test_representation(inputShape));
auto f = ov::test::snippets::ConvertFunction(inputShape, types.first[0], types.second[0]);
function = f.getOriginal();
output_type = types.second.front();
}
parameters Convert::generate_params_random() const {
int32_t startFrom, range, resolution = 5;
switch (output_type) {
case ov::element::f32:
case ov::element::i32:
case ov::element::bf16:
startFrom = -10;
range = 20;
break;
case ov::element::u8:
startFrom = -10;
range = 20;
break;
case ov::element::i8:
startFrom = 117;
range = 20;
break;
default:
startFrom = 0;
range = 10;
}
return {{ startFrom, range, resolution }};
}
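// Note (inferred from the constants above, not stated in the patch): for i8
// outputs the generated values fall in [117, 137), i.e. above INT8_MAX == 127,
// so saturating and truncating conversions produce observably different results.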
void Convert::generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) {
inputs.clear();
const auto& funcInputs = function->inputs();
const auto params = generate_params_random();
if (params.size() != funcInputs.size()) {
IE_THROW() << "Incorrect count of parameters for random generation and inputs of function!";
}
for (int i = 0; i < funcInputs.size(); ++i) {
const auto& funcInput = funcInputs[i];
ov::Tensor tensor;
int32_t startFrom, range, resolution;
std::tie(startFrom, range, resolution) = params[i];
tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i],
range, startFrom, resolution);
inputs.insert({funcInput.get_node_shared_ptr(), tensor});
}
}
void ConvertInput::SetUp() {
std::vector<ov::Shape> inputShape;
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
init_input_shapes(static_shapes_to_test_representation(inputShape));
auto f = ov::test::snippets::ConvertInputFunction(inputShape, types.first[0], types.second[0]);
function = f.getOriginal();
}
parameters ConvertInput::generate_params_random() const {
parameters params;
const auto& funcInputs = function->inputs();
for (int i = 0; i < funcInputs.size(); ++i) {
int32_t startFrom, range, resolution = 1;
switch (funcInputs[i].get_element_type()) {
case ov::element::f32:
case ov::element::bf16:
startFrom = -10;
range = 20;
resolution = 7;
break;
case ov::element::i32:
case ov::element::i8:
startFrom = -10;
range = 20;
break;
case ov::element::u8:
startFrom = 10;
range = 20;
break;
default:
startFrom = 0;
range = 10;
}
params.push_back({ startFrom, range, resolution });
}
return params;
}
void ConvertOutput::SetUp() {
std::vector<ov::Shape> inputShape;
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
init_input_shapes(static_shapes_to_test_representation(inputShape));
auto f = ov::test::snippets::ConvertOutputFunction(inputShape, types.first[0], types.second[0]);
function = f.getOriginal();
output_type = types.second.front();
}
void ConvertStub::SetUp() {
std::vector<ov::Shape> inputShape;
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
init_input_shapes(static_shapes_to_test_representation(inputShape));
auto f = ov::test::snippets::ConvertStubFunction(inputShape, types.first[0], types.second[0]);
function = f.getOriginal();
output_type = types.second.front();
}
void ConvertPartialInputsAndResults::SetUp() {
std::vector<ov::Shape> inputShape;
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
init_input_shapes(static_shapes_to_test_representation(inputShape));
auto f = ov::test::snippets::ConvertPartialInputsAndResultsFunction(inputShape, types.first, types.second);
function = f.getOriginal();
}
void ConvertManyOnInputs::SetUp() {
std::vector<ov::Shape> inputShape;
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
init_input_shapes(static_shapes_to_test_representation(inputShape));
auto f = ov::test::snippets::ConvertManyOnInputsFunction(inputShape, types.first);
function = f.getOriginal();
}
void ConvertManyOnOutputs::SetUp() {
std::vector<ov::Shape> inputShape;
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
init_input_shapes(static_shapes_to_test_representation(inputShape));
auto f = ov::test::snippets::ConvertManyOnOutputsFunction(inputShape, types.first);
function = f.getOriginal();
}
void ConvertManyOnInputOutput::SetUp() {
std::vector<ov::Shape> inputShape;
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
init_input_shapes(static_shapes_to_test_representation(inputShape));
auto f = ov::test::snippets::ConvertManyOnInputOutputFunction(inputShape, types.first, types.second);
function = f.getOriginal();
}
TEST_P(Convert, CompareWithRefImpl) {
run();
validateNumSubgraphs();
}
TEST_P(ConvertInput, CompareWithRefImpl) {
run();
validateNumSubgraphs();
}
TEST_P(ConvertOutput, CompareWithRefImpl) {
run();
validateNumSubgraphs();
}
TEST_P(ConvertStub, CompareWithRefImpl) {
run();
validateNumSubgraphs();
}
TEST_P(ConvertPartialInputsAndResults, CompareWithRefImpl) {
run();
validateNumSubgraphs();
}
TEST_P(ConvertManyOnInputs, CompareWithRefImpl) {
run();
validateNumSubgraphs();
}
TEST_P(ConvertManyOnOutputs, CompareWithRefImpl) {
run();
validateNumSubgraphs();
}
TEST_P(ConvertManyOnInputOutput, CompareWithRefImpl) {
run();
validateNumSubgraphs();
}
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,44 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "common_test_utils/common_utils.hpp"
#include "snippets/eltwise_two_results.hpp"
#include "subgraph_simple.hpp"
namespace ov {
namespace test {
namespace snippets {
std::string EltwiseTwoResults::getTestCaseName(testing::TestParamInfo<ov::test::snippets::EltwiseTwoResultsParams> obj) {
ov::Shape inputShapes0, inputShapes1;
std::string targetDevice;
size_t num_nodes, num_subgraphs;
std::tie(inputShapes0, inputShapes1, num_nodes, num_subgraphs, targetDevice) = obj.param;
std::ostringstream result;
result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_";
result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_";
result << "#N=" << num_nodes << "_";
result << "#S=" << num_subgraphs << "_";
result << "targetDevice=" << targetDevice;
return result.str();
}
void EltwiseTwoResults::SetUp() {
ov::Shape inputShape0, inputShape1;
std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}});
auto f = ov::test::snippets::EltwiseTwoResultsFunction({inputShape0, inputShape1});
function = f.getOriginal();
}
TEST_P(EltwiseTwoResults, CompareWithRefImpl) {
run();
validateNumSubgraphs();
}
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,49 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "common_test_utils/common_utils.hpp"
#include "snippets/max_num_params_eltwise.hpp"
#include "subgraph_simple.hpp"
namespace ov {
namespace test {
namespace snippets {
std::string MaxNumParamsEltwiseSinh::getTestCaseName(testing::TestParamInfo<ov::test::snippets::MaxNumParamsEltwiseParams> obj) {
ov::Shape inputShapes;
std::string targetDevice;
size_t num_nodes, num_subgraphs;
std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param;
std::ostringstream result;
result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes) << "_";
result << "#N=" << num_nodes << "_";
result << "#S=" << num_subgraphs << "_";
result << "targetDevice=" << targetDevice;
return result.str();
}
void MaxNumParamsEltwiseSinh::SetUp() {
ov::Shape inputShape;
std::tie(inputShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
std::vector<ov::Shape> expandedShapes(10, inputShape);
std::vector<InputShape> input_shapes;
for (const auto& s : expandedShapes) {
input_shapes.emplace_back(InputShape {{}, {s, }});
}
init_input_shapes(input_shapes);
auto f = ov::test::snippets::EltwiseMaxNumParamsSinhFunction(expandedShapes);
function = f.getOriginal();
}
TEST_P(MaxNumParamsEltwiseSinh, CompareWithRefImpl) {
run();
validateNumSubgraphs();
}
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -10,42 +10,42 @@ namespace ov {
namespace test {
namespace snippets {
std::string ThreeInputsEltwise::getTestCaseName(testing::TestParamInfo<ov::test::snippets::ThreeInputsEltwiseParams> obj) {
ov::Shape inputShapes0, inputShapes1, inputShapes2;
std::string targetDevice;
size_t num_nodes, num_subgraphs;
std::tie(inputShapes0, inputShapes1, inputShapes2,
num_nodes, num_subgraphs, targetDevice) = obj.param;
std::ostringstream result;
result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_";
result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_";
result << "IS[2]=" << CommonTestUtils::vec2str(inputShapes2) << "_";
result << "#N=" << num_nodes << "_";
result << "#S=" << num_subgraphs << "_";
result << "targetDevice=" << targetDevice;
return result.str();
}
void ThreeInputsEltwise::SetUp() {
ov::Shape inputShape0, inputShape1, inputShape2;
std::tie(inputShape0, inputShape1, inputShape2,
ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, {inputShape2, }}});
auto f = ov::test::snippets::EltwiseThreeInputsFunction({inputShape0, inputShape1, inputShape2});
function = f.getOriginal();
}
void ThreeInputsEltwiseSinh::SetUp() {
ov::Shape inputShape0, inputShape1, inputShape2;
std::tie(inputShape0, inputShape1, inputShape2,
ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, {inputShape2, }}});
auto f = ov::test::snippets::EltwiseThreeInputsSinhFunction({inputShape0, inputShape1, inputShape2});
function = f.getOriginal();
}
TEST_P(ThreeInputsEltwise, CompareWithRefImpl) {
run();

View File

@ -0,0 +1,43 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "common_test_utils/common_utils.hpp"
#include "snippets/two_inputs_and_outputs.hpp"
#include "subgraph_simple.hpp"
namespace ov {
namespace test {
namespace snippets {
std::string TwoInputsAndOutputs::getTestCaseName(testing::TestParamInfo<ov::test::snippets::TwoInputsAndOutputsParams> obj) {
std::vector<ov::Shape> inputShapes;
std::string targetDevice;
size_t num_nodes, num_subgraphs;
std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param;
std::ostringstream result;
for (size_t i = 0; i < inputShapes.size(); i++)
result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i]) << "_";
result << "#N=" << num_nodes << "_";
result << "#S=" << num_subgraphs << "_";
result << "targetDevice=" << targetDevice;
return result.str();
}
void TwoInputsAndOutputs::SetUp() {
std::vector<ov::Shape> inputShape;
std::tie(inputShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
init_input_shapes(static_shapes_to_test_representation(inputShape));
auto f = ov::test::snippets::TwoInputsAndOutputsFunction(inputShape);
function = f.getOriginal();
}
TEST_P(TwoInputsAndOutputs, CompareWithRefImpl) {
run();
validateNumSubgraphs();
}
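// Illustrative only - a sketch of how this test could be instantiated (not part of this
// commit; the shapes, expected node/subgraph counts, and suite name below are assumptions):
//
// INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TwoInputsAndOutputs, TwoInputsAndOutputs,
//         ::testing::Combine(
//                 ::testing::Values(std::vector<ov::Shape>{{1, 64, 10, 10}, {1, 64, 10, 1}}),
//                 ::testing::Values(4),   // num_nodes (assumed)
//                 ::testing::Values(1),   // num_subgraphs (assumed)
//                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
//         TwoInputsAndOutputs::getTestCaseName);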
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -3,11 +3,16 @@
//
#include "shared_test_classes/base/snippets_test_utils.hpp"
#include "functional_test_utils/skip_tests_config.hpp"
#include "exec_graph_info.hpp"
namespace ov {
namespace test {
void SnippetsTestsCommon::validateNumSubgraphs() {
bool isCurrentTestDisabled = FuncTestUtils::SkipTestsConfig::currentTestIsDisabled();
if (isCurrentTestDisabled)
GTEST_SKIP() << "Disabled test due to configuration" << std::endl;
const auto& compiled_model = compiledModel.get_runtime_model();
size_t num_subgraphs = 0;
size_t num_nodes = 0;
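// The hunk is truncated here. A plausible continuation - an assumption, not the verbatim
// source - iterates the runtime model (see exec_graph_info.hpp, included above) and
// compares the counts against the references stored by SetUp():
//
// for (const auto& op : compiled_model->get_ops()) {
//     const auto layer_type = op->get_rt_info().at(ExecGraphInfoSerialization::LAYER_TYPE).as<std::string>();
//     num_nodes++;                 // possibly filtering auxiliary Input/Output layers
//     if (layer_type == "Subgraph")
//         num_subgraphs++;
// }
// ASSERT_EQ(ref_num_nodes, num_nodes);
// ASSERT_EQ(ref_num_subgraphs, num_subgraphs);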

View File

@ -0,0 +1,214 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "ngraph/ngraph.hpp"
#include "./snippets_helpers.hpp"
/* This file contains definitions of relatively simple functions (models) that will be used
* to test snippets-specific behavior. All the functions are expected to be direct descendants of
* SnippetsFunctionBase, so their constructors take inputShapes as the first argument
* (plus the element types involved in the Convert operations).
*/
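// Typical usage (an illustrative sketch, not part of this header; the shape is an assumption):
//   auto f = ConvertFunction({{1, 3, 16, 16}});
//   auto original  = f.getOriginal();   // the model as it looks before tokenization
//   auto reference = f.getReference();  // the expected model after snippets tokenization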
namespace ov {
namespace test {
namespace snippets {
/// The most trivial graph, just one Convert.
/// Tokenized simply by starting subgraph.
// in1
// Convert
// Result
class ConvertFunction : public SnippetsFunctionBase {
public:
explicit ConvertFunction(const std::vector<Shape>& inputShapes,
const ov::element::Type inType = ov::element::f32,
const ov::element::Type outType = ov::element::u8)
: SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) {
NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");
}
protected:
std::shared_ptr<ov::Model> initOriginal() const override;
std::shared_ptr<ov::Model> initReference() const override;
ov::element::Type inType;
ov::element::Type outType;
};
/// One of the inputs of Add is a Convert
/// Tokenized simply by starting subgraph.
// in1
// Convert in2
// Add
// Result
class ConvertInputFunction : public SnippetsFunctionBase {
public:
explicit ConvertInputFunction(const std::vector<Shape>& inputShapes,
const ov::element::Type inType = ov::element::f32,
const ov::element::Type outType = ov::element::u8)
: SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) {
NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");
}
protected:
std::shared_ptr<ov::Model> initOriginal() const override;
std::shared_ptr<ov::Model> initReference() const override;
ov::element::Type inType;
ov::element::Type outType;
};
/// The output of Sub is a Convert
/// Tokenized simply by starting subgraph.
// in1 in2
// Sub
// Convert
// Result
class ConvertOutputFunction : public SnippetsFunctionBase {
public:
explicit ConvertOutputFunction(const std::vector<Shape>& inputShapes,
const ov::element::Type inType = ov::element::f32,
const ov::element::Type outType = ov::element::i8)
: SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) {
NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");
}
protected:
std::shared_ptr<ov::Model> initOriginal() const override;
std::shared_ptr<ov::Model> initReference() const override;
ov::element::Type inType;
ov::element::Type outType;
};
/// There are 2 subgraphs: Add + Convert(Stub) and Relu
/// Tokenized simply by starting subgraph.
// in1 in2 in1 in2
// Add Subgraph
// Convert -> |
// Relu Subgraph
// Result Result
class ConvertStubFunction : public SnippetsFunctionBase {
public:
explicit ConvertStubFunction(const std::vector<Shape>& inputShapes,
const ov::element::Type inType = ov::element::f32,
const ov::element::Type outType = ov::element::i8)
: SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) {
NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");
}
protected:
std::shared_ptr<ov::Model> initOriginal() const override;
std::shared_ptr<ov::Model> initReference() const override;
ov::element::Type inType;
ov::element::Type outType;
};
/// Not all Inputs and Results have a Convert
/// Tokenized simply by starting subgraph.
// in1 in2
// Convert Convert
// Add
// Relu in3
// Convert Sub
// Result1 Unsqueeze <- It's needed to avoid multiple result outputs for the subgraph (a limitation of collapsing)
// Result2
class ConvertPartialInputsAndResultsFunction : public SnippetsFunctionBase {
public:
explicit ConvertPartialInputsAndResultsFunction(const std::vector<Shape>& inputShapes,
const std::vector<ov::element::Type>& inTypes = {ov::element::f32},
const std::vector<ov::element::Type>& outTypes = {ov::element::f32})
: SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) {
NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes");
}
protected:
std::shared_ptr<ov::Model> initOriginal() const override;
std::shared_ptr<ov::Model> initReference() const override;
std::vector<ov::element::Type> inTypes;
std::vector<ov::element::Type> outTypes;
};
/// Convert Sequence on input
/// Tokenized simply by starting subgraph.
// in in
// Stub Stub
// Convert |
// Convert -> Subgraph
// Convert |
// Relu Result
// Result
class ConvertManyOnInputsFunction : public SnippetsFunctionBase {
public:
explicit ConvertManyOnInputsFunction(const std::vector<Shape>& inputShapes, const std::vector<ov::element::Type>& types)
: SnippetsFunctionBase(inputShapes), types(types) {
NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");
NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types");
}
protected:
std::shared_ptr<ov::Model> initOriginal() const override;
std::shared_ptr<ov::Model> initReference() const override;
std::vector<ov::element::Type> types;
};
/// Convert Sequence on output
/// Tokenized simply by starting subgraph.
// in in
// Stub Stub
// Relu |
// Convert -> Subgraph
// Convert |
// Convert |
// Result Result
class ConvertManyOnOutputsFunction : public SnippetsFunctionBase {
public:
explicit ConvertManyOnOutputsFunction(const std::vector<Shape>& inputShapes, const std::vector<ov::element::Type>& types)
: SnippetsFunctionBase(inputShapes), types(types) {
NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");
NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types");
}
protected:
std::shared_ptr<ov::Model> initOriginal() const override;
std::shared_ptr<ov::Model> initReference() const override;
std::vector<ov::element::Type> types;
};
/// Convert Sequence on input and output
/// Tokenized simply by starting subgraph.
// in in
// Stub Stub
// Convert |
// Convert |
// Convert |
// Relu -> Subgraph
// Convert |
// Convert |
// Convert |
// Result Result
class ConvertManyOnInputOutputFunction : public SnippetsFunctionBase {
public:
explicit ConvertManyOnInputOutputFunction(const std::vector<Shape>& inputShapes,
const std::vector<ov::element::Type>& inTypes,
const std::vector<ov::element::Type>& outTypes)
: SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) {
NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");
NGRAPH_CHECK(inTypes.size() > 1, "Got invalid number of input element types");
NGRAPH_CHECK(outTypes.size() > 0, "Got invalid number of output element types");
}
protected:
std::shared_ptr<ov::Model> initOriginal() const override;
std::shared_ptr<ov::Model> initReference() const override;
std::vector<ov::element::Type> inTypes;
std::vector<ov::element::Type> outTypes;
};
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -7,6 +7,7 @@
#include "ngraph/ngraph.hpp"
#include "snippets_helpers.hpp"
#include "subgraph_simple.hpp"
#include "subgraph_converts.hpp"
/* This file provides lowered representations (after generate() was called) for some simple functions.
* This is required to test snippets lowering and optimization passes. All the functions are expected to be direct
@ -45,7 +46,7 @@ public:
protected:
std::shared_ptr<ov::Model> initLowered() const override;
private:
std::vector<Shape> broadcast_shapes;;
std::vector<Shape> broadcast_shapes;
};
} // namespace snippets

View File

@ -29,13 +29,14 @@ protected:
std::shared_ptr<ov::Model> initOriginal() const override;
std::shared_ptr<ov::Model> initReference() const override;
};
/// Add separated from inputs by Sin to WA CPU-specific disabling after inputs.
/// Add separated from inputs by Sinh to WA CPU-specific disabling after inputs.
/// Works because Sinh is not supported by tokenization yet.
/// Tokenized simply by starting subgraph.
// in1 in2
// Sin Sinh
// Sinh Sinh
// Add
// Result
// todo: remove Sinh once "no subgraph after input" limitation is relaxed
class AddSinhFunction : public SnippetsFunctionBase {
public:
explicit AddSinhFunction(const std::vector<Shape>& inputShapes) : SnippetsFunctionBase(inputShapes) {
@ -45,6 +46,21 @@ protected:
std::shared_ptr<ov::Model> initOriginal() const override;
std::shared_ptr<ov::Model> initReference() const override;
};
/// Like AddSinh but with a constant second input (and no Sinh on it)
// in1 const
// Sinh |
// Add
// Result
// todo: remove Sinh once "no subgraph after input" limitation is relaxed
class AddSinhConstFunction : public SnippetsFunctionBase {
public:
explicit AddSinhConstFunction(const std::vector<Shape>& inputShapes) : SnippetsFunctionBase(inputShapes) {
NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");
}
protected:
std::shared_ptr<ov::Model> initOriginal() const override;
// std::shared_ptr<ov::Model> initReference() const override;
};
/// Simple Eltwise graph fully convertible to Subgraph.
/// Tokenized simply by attaching eltwises.
// in1 in2
@ -77,6 +93,7 @@ protected:
};
/// EltwiseThreeInputsFunction with Sinh after inputs to WA CPU-specific disabling after inputs
/// See AddSinh for details.
// todo: remove Sinh once "no subgraph after input" limitation is relaxed
class EltwiseThreeInputsSinhFunction : public SnippetsFunctionBase {
public:
explicit EltwiseThreeInputsSinhFunction(const std::vector<Shape>& inputShapes) :
@ -86,6 +103,24 @@ public:
protected:
std::shared_ptr<ov::Model> initOriginal() const override;
};
/// Eltwise graph with 10 inputs and 2 outputs.
/// Needed to test the maximum number of inputs+outputs allowed.
// in1 in2 in3 ... in10
// Sinh Sinh Sinh ...Sinh
// ........................
// Subtract Power
// \ Sinh
// Result
// todo: remove Sinh once "no subgraph after input" limitation is relaxed
class EltwiseMaxNumParamsSinhFunction : public SnippetsFunctionBase {
public:
explicit EltwiseMaxNumParamsSinhFunction(const std::vector<Shape>& inputShapes) :
SnippetsFunctionBase(inputShapes) {
NGRAPH_CHECK(input_shapes.size() == 10, "Got invalid number of input shapes");
}
protected:
std::shared_ptr<ov::Model> initOriginal() const override;
};
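// Illustrative construction (the shape is an assumption) - exactly ten input shapes are required:
//   std::vector<Shape> shapes(10, Shape{1, 64, 10, 10});
//   auto f = EltwiseMaxNumParamsSinhFunction(shapes);  // builds a model with 10 inputs and 2 results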
/// MatMul with two eltwise branches joined with Add just before the Result.
/// Tokenized by attaching eltwises to separate subgraphs, and then joining them together.
// in1 in2
@ -125,7 +160,41 @@ protected:
std::shared_ptr<ov::Model> initOriginal() const override;
std::shared_ptr<ov::Model> initReference() const override;
};
/// 2 results.
/// So we have 2 subgraphs: Snippets don't support subgraphs with multiple results.
/// Output tensors also have names, to check that output names are copied correctly.
// in1 in2
// Sinh Sinh
// Add
// HSwish Result
// Relu
// Result
class EltwiseTwoResultsFunction : public SnippetsFunctionBase {
public:
explicit EltwiseTwoResultsFunction(const std::vector<Shape>& inputShapes) : SnippetsFunctionBase(inputShapes) {
NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");
}
protected:
std::shared_ptr<ov::Model> initOriginal() const override;
std::shared_ptr<ov::Model> initReference() const override;
};
/// Two different Inputs and Outputs.
/// This function checks correct Broadcasting.
// in1 in2
// Sin Sin
// HSwish /
// Result Add
// Relu
// Sin
// Result
class TwoInputsAndOutputsFunction : public SnippetsFunctionBase {
public:
explicit TwoInputsAndOutputsFunction(const std::vector<Shape>& inputShapes) : SnippetsFunctionBase(inputShapes) {
NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");
}
protected:
std::shared_ptr<ov::Model> initOriginal() const override;
};
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,241 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "subgraph_converts.hpp"
#include "common_test_utils/data_utils.hpp"
#include <snippets/op/convert_truncation.hpp>
#include <snippets/op/subgraph.hpp>
namespace ov {
namespace test {
namespace snippets {
std::shared_ptr<ov::Node> createRollAsStub(const std::shared_ptr<ov::Node>& parent) {
auto shift = std::make_shared<op::v0::Constant>(ov::element::i32, Shape{1}, std::vector<int>{1});
auto axes = std::make_shared<op::v0::Constant>(ov::element::i32, Shape{1}, std::vector<int>{0});
return std::make_shared<op::v7::Roll>(parent->output(0), shift, axes);
}
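// Note: Roll appears to act here as a tokenization "stub" - an op the snippets pass does not
// tokenize - so subgraphs are never attached directly to Parameters (the same role Sinh plays
// in the subgraph_simple functions above).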
std::shared_ptr<ov::Model> ConvertFunction::initOriginal() const {
auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
auto stub = createRollAsStub(data0);
auto convert = std::make_shared<op::v0::Convert>(stub, outType);
return std::make_shared<ov::Model>(NodeVector{convert}, ParameterVector{data0});
}
std::shared_ptr<ov::Model> ConvertFunction::initReference() const {
auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
auto stub = createRollAsStub(data0);
auto indata0 = std::make_shared<op::v0::Parameter>(inType, stub->get_shape());
auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{stub},
std::make_shared<ov::Model>(NodeVector{std::make_shared<ngraph::snippets::op::ConvertTruncation>(indata0, outType)},
ParameterVector{indata0}));
return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{data0});
}
std::shared_ptr<ov::Model> ConvertInputFunction::initOriginal() const {
auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
auto data1 = std::make_shared<op::v0::Parameter>(outType, input_shapes[1]);
auto stub0 = createRollAsStub(data0);
auto stub1 = createRollAsStub(data1);
auto convert = std::make_shared<op::v0::Convert>(stub0, outType);
auto add = std::make_shared<op::v1::Add>(convert, stub1);
return std::make_shared<ov::Model>(NodeVector{add}, ParameterVector{data0, data1});
}
std::shared_ptr<ov::Model> ConvertInputFunction::initReference() const {
auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
auto data1 = std::make_shared<op::v0::Parameter>(outType, input_shapes[1]);
auto stub0 = createRollAsStub(data0);
auto stub1 = createRollAsStub(data1);
auto indata0 = std::make_shared<op::v0::Parameter>(inType, stub0->get_shape());
auto indata1 = std::make_shared<op::v0::Parameter>(outType, stub1->get_shape());
auto convert = std::make_shared<ngraph::snippets::op::ConvertTruncation>(indata0, outType);
auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{stub0, stub1},
std::make_shared<ov::Model>(
NodeVector{std::make_shared<op::v1::Add>(convert, indata1)},
ParameterVector{indata0, indata1}));
return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{data0, data1});
}
std::shared_ptr<ov::Model> ConvertOutputFunction::initOriginal() const {
auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
auto data1 = std::make_shared<op::v0::Parameter>(inType, input_shapes[1]);
auto stub0 = createRollAsStub(data0);
auto stub1 = createRollAsStub(data1);
auto add = std::make_shared<op::v1::Add>(stub0, stub1);
auto convert = std::make_shared<op::v0::Convert>(add, outType);
return std::make_shared<ov::Model>(NodeVector{convert}, ParameterVector{data0, data1});
}
std::shared_ptr<ov::Model> ConvertOutputFunction::initReference() const {
auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
auto data1 = std::make_shared<op::v0::Parameter>(inType, input_shapes[1]);
auto stub0 = createRollAsStub(data0);
auto stub1 = createRollAsStub(data1);
auto indata0 = std::make_shared<op::v0::Parameter>(inType, stub0->get_shape());
auto indata1 = std::make_shared<op::v0::Parameter>(inType, stub1->get_shape());
auto add = std::make_shared<op::v1::Add>(indata0, indata1);
auto convert = std::make_shared<ngraph::snippets::op::ConvertTruncation>(add, outType);
auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{stub0, stub1},
std::make_shared<ov::Model>(
NodeVector{convert},
ParameterVector{indata0, indata1}));
return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{data0, data1});
}
std::shared_ptr<ov::Model> ConvertStubFunction::initOriginal() const {
auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
auto data1 = std::make_shared<op::v0::Parameter>(inType, input_shapes[1]);
auto stub0 = createRollAsStub(data0);
auto stub1 = createRollAsStub(data1);
auto add = std::make_shared<op::v1::Add>(stub0, stub1);
auto convert = std::make_shared<op::v0::Convert>(add, outType);
auto relu = std::make_shared<op::v0::Relu>(convert);
return std::make_shared<ov::Model>(NodeVector{relu}, ParameterVector{data0, data1});
}
std::shared_ptr<ov::Model> ConvertStubFunction::initReference() const {
auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
auto data1 = std::make_shared<op::v0::Parameter>(inType, input_shapes[1]);
auto stub0 = createRollAsStub(data0);
auto stub1 = createRollAsStub(data1);
auto indata0 = std::make_shared<op::v0::Parameter>(inType, stub0->get_shape());
auto indata1 = std::make_shared<op::v0::Parameter>(inType, stub1->get_shape());
auto add = std::make_shared<op::v1::Add>(indata0, indata1);
auto convert = std::make_shared<ngraph::snippets::op::ConvertTruncation>(add, outType);
auto subgraph0 = std::make_shared<ngraph::snippets::op::Subgraph>(
NodeVector{stub0, stub1}, std::make_shared<ov::Model>(NodeVector{convert}, ParameterVector{indata0, indata1}));
auto indata2 = std::make_shared<op::v0::Parameter>(convert->get_destination_type(), convert->get_shape());
auto relu = std::make_shared<op::v0::Relu>(indata2);
auto subgraph1 = std::make_shared<ngraph::snippets::op::Subgraph>(
NodeVector{subgraph0}, std::make_shared<ov::Model>(NodeVector{relu}, ParameterVector{indata2}));
return std::make_shared<ov::Model>(NodeVector{subgraph1}, ParameterVector{data0, data1});
}
std::shared_ptr<ov::Model> ConvertPartialInputsAndResultsFunction::initOriginal() const {
auto data0 = std::make_shared<op::v0::Parameter>(inTypes[0], input_shapes[0]);
auto data1 = std::make_shared<op::v0::Parameter>(inTypes[1], input_shapes[1]);
auto data2 = std::make_shared<op::v0::Parameter>(inTypes[2], input_shapes[2]);
auto stub0 = createRollAsStub(data0);
auto stub1 = createRollAsStub(data1);
auto stub2 = createRollAsStub(data2);
auto convert0 = std::make_shared<op::v0::Convert>(stub0, outTypes[0]);
auto convert1 = std::make_shared<op::v0::Convert>(stub1, outTypes[0]);
auto add = std::make_shared<op::v1::Add>(convert0, convert1);
auto relu = std::make_shared<op::v0::Relu>(add);
auto sub = std::make_shared<op::v1::Subtract>(relu, stub2);
auto stub3 = createRollAsStub(sub);
auto convert2 = std::make_shared<op::v0::Convert>(relu, outTypes[1]);
return std::make_shared<ov::Model>(NodeVector{convert2, stub3}, ParameterVector{data0, data1, data2});
}
std::shared_ptr<ov::Model> ConvertPartialInputsAndResultsFunction::initReference() const {
auto data0 = std::make_shared<op::v0::Parameter>(inTypes[0], input_shapes[0]);
auto data1 = std::make_shared<op::v0::Parameter>(inTypes[1], input_shapes[1]);
auto data2 = std::make_shared<op::v0::Parameter>(inTypes[2], input_shapes[2]);
auto stub0 = createRollAsStub(data0);
auto stub1 = createRollAsStub(data1);
auto stub2 = createRollAsStub(data2);
auto indata0 = std::make_shared<op::v0::Parameter>(inTypes[0], stub0->get_shape());
auto indata1 = std::make_shared<op::v0::Parameter>(inTypes[1], stub1->get_shape());
auto indata2 = std::make_shared<op::v0::Parameter>(inTypes[2], stub2->get_shape());
auto convert0 = std::make_shared<ngraph::snippets::op::ConvertTruncation>(indata0, outTypes[0]);
auto convert1 = std::make_shared<ngraph::snippets::op::ConvertTruncation>(indata1, outTypes[0]);
auto add = std::make_shared<op::v1::Add>(convert0, convert1);
auto relu = std::make_shared<op::v0::Relu>(add);
auto sub = std::make_shared<op::v1::Subtract>(relu, indata2);
auto convert2 = std::make_shared<ngraph::snippets::op::ConvertTruncation>(relu, outTypes[1]);
auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(
NodeVector{stub0, stub1, stub2}, std::make_shared<ov::Model>(NodeVector{sub, convert2}, ParameterVector{indata0, indata1, indata2}));
auto stub3 = createRollAsStub(subgraph);
return std::make_shared<ov::Model>(OutputVector{subgraph->output(1), stub3->output(0)},
ParameterVector{data0, data1, data2});
}
std::shared_ptr<ov::Model> ConvertManyOnInputsFunction::initOriginal() const {
auto data0 = std::make_shared<op::v0::Parameter>(types[0], input_shapes[0]);
auto stub0 = createRollAsStub(data0);
std::shared_ptr<ov::Node> out = stub0;
for (size_t i = 1; i < types.size(); i++) {
auto convert = std::make_shared<op::v0::Convert>(out, types[i]);
out = convert;
}
auto relu = std::make_shared<op::v0::Relu>(out);
return std::make_shared<ov::Model>(NodeVector{relu}, ParameterVector{data0});
}
std::shared_ptr<ov::Model> ConvertManyOnInputsFunction::initReference() const {
auto data0 = std::make_shared<op::v0::Parameter>(types[0], input_shapes[0]);
auto stub0 = createRollAsStub(data0);
auto indata0 = std::make_shared<op::v0::Parameter>(types[0], stub0->get_shape());
std::shared_ptr<ov::Node> out = indata0;
for (size_t i = 1; i < types.size(); i++) {
auto convert = std::make_shared<ngraph::snippets::op::ConvertTruncation>(out, types[i]);
out = convert;
}
auto relu = std::make_shared<op::v0::Relu>(out);
auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{stub0},
std::make_shared<ov::Model>(NodeVector{relu}, ParameterVector{indata0}));
return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{data0});
}
std::shared_ptr<ov::Model> ConvertManyOnOutputsFunction::initOriginal() const {
auto data0 = std::make_shared<op::v0::Parameter>(types[0], input_shapes[0]);
auto stub0 = std::make_shared<ov::op::v0::Sinh>(data0);
auto relu = std::make_shared<op::v0::Relu>(stub0);
std::shared_ptr<ov::Node> out = relu;
for (size_t i = 1; i < types.size(); i++) {
auto convert = std::make_shared<op::v0::Convert>(out, types[i]);
out = convert;
}
return std::make_shared<ov::Model>(NodeVector{out}, ParameterVector{data0});
}
std::shared_ptr<ov::Model> ConvertManyOnOutputsFunction::initReference() const {
auto data0 = std::make_shared<op::v0::Parameter>(types[0], input_shapes[0]);
auto stub0 = std::make_shared<ov::op::v0::Sinh>(data0);
auto indata0 = std::make_shared<op::v0::Parameter>(types[0], stub0->get_shape());
auto relu = std::make_shared<op::v0::Relu>(indata0);
std::shared_ptr<ov::Node> out = relu;
for (size_t i = 1; i < types.size(); i++) {
auto convert = std::make_shared<ngraph::snippets::op::ConvertTruncation>(out, types[i]);
out = convert;
}
auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{stub0},
std::make_shared<ov::Model>(NodeVector{out}, ParameterVector{indata0}));
return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{data0});
}
std::shared_ptr<ov::Model> ConvertManyOnInputOutputFunction::initOriginal() const {
auto data0 = std::make_shared<op::v0::Parameter>(inTypes[0], input_shapes[0]);
auto stub0 = std::make_shared<ov::op::v0::Sinh>(data0);
std::shared_ptr<ov::Node> out = stub0;
for (size_t i = 1; i < inTypes.size(); i++) {
auto convert = std::make_shared<op::v0::Convert>(out, inTypes[i]);
out = convert;
}
auto relu = std::make_shared<op::v0::Relu>(out);  // Relu consumes the input Convert chain, as in the diagram
out = relu;
for (size_t i = 0; i < outTypes.size(); i++) {
auto convert = std::make_shared<op::v0::Convert>(out, outTypes[i]);
out = convert;
}
return std::make_shared<ov::Model>(NodeVector{out}, ParameterVector{data0});
}
std::shared_ptr<ov::Model> ConvertManyOnInputOutputFunction::initReference() const {
auto data0 = std::make_shared<op::v0::Parameter>(inTypes[0], input_shapes[0]);
auto stub0 = std::make_shared<ov::op::v0::Sinh>(data0);
auto indata0 = std::make_shared<op::v0::Parameter>(inTypes[0], stub0->get_shape());
std::shared_ptr<ov::Node> out = indata0;
for (size_t i = 1; i < inTypes.size(); i++) {
auto convert = std::make_shared<ngraph::snippets::op::ConvertTruncation>(out, inTypes[i]);
out = convert;
}
auto relu = std::make_shared<op::v0::Relu>(out);  // must consume the chain inside the subgraph body, not the external stub0
out = relu;
for (size_t i = 0; i < outTypes.size(); i++) {
auto convert = std::make_shared<ngraph::snippets::op::ConvertTruncation>(out, outTypes[i]);
out = convert;
}
}
auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{stub0},
std::make_shared<ov::Model>(NodeVector{out}, ParameterVector{indata0}));
return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{data0});
}
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -46,6 +46,14 @@ std::shared_ptr<ov::Model> AddSinhFunction::initReference() const {
ParameterVector{indata0, indata1}));
return std::make_shared<ov::Model>(NodeVector{add}, ParameterVector{data0, data1});
}
std::shared_ptr<ov::Model> AddSinhConstFunction::initOriginal() const {
auto data0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
const std::vector<float> const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0]), -10., 10.);
auto const_data1 = std::make_shared<op::v0::Constant>(precision, input_shapes[0], const_values);
auto sinh0 = std::make_shared<ov::op::v0::Sinh>(data0);
auto add = std::make_shared<op::v1::Add>(sinh0, const_data1);
return std::make_shared<ov::Model>(NodeVector{add}, ParameterVector{data0});
}
std::shared_ptr<ov::Model> EltwiseFunction::initOriginal() const {
auto data0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
auto data1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
@ -98,6 +106,28 @@ std::shared_ptr<ov::Model> EltwiseThreeInputsSinhFunction::initOriginal() const
auto mul = std::make_shared<op::v1::Multiply>(add, sub);
return std::make_shared<ov::Model>(NodeVector{mul}, ParameterVector{data0, data1, data2});
}
std::shared_ptr<ov::Model> EltwiseMaxNumParamsSinhFunction::initOriginal() const {
ParameterVector params;
std::vector<std::shared_ptr<Node>> sinh; // 10
for (const auto& shape : input_shapes) {
auto param = std::make_shared<op::v0::Parameter>(precision, shape);
params.push_back(param);
sinh.push_back(std::make_shared<op::v0::Sinh>(param));
}
std::vector<std::shared_ptr<Node>> add; // 5
for (size_t i = 0; i < input_shapes.size() / 2; i++) {
add.push_back(std::make_shared<op::v1::Add>(sinh[i * 2], sinh[i * 2 + 1]));
}
std::vector<std::shared_ptr<Node>> mul; // 2
for (size_t i = 0; i < add.size() / 2; i++) {
auto mul_node = std::make_shared<op::v1::Multiply>(add[i * 2], add[i * 2 + 1]);
mul.push_back(mul_node);
}
auto sub = std::make_shared<op::v1::Subtract>(mul[0], mul[1]);
auto power = std::make_shared<op::v1::Power>(add.back(), sub);
auto exit_sinh = std::make_shared<op::v0::Sinh>(power);
return std::make_shared<ov::Model>(NodeVector{sub, exit_sinh}, params);
}
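// Resulting topology for 10 inputs: 10 Sinh -> 5 Add -> 2 Multiply -> Subtract(mul[0], mul[1]);
// Power takes add.back() and the Subtract result, followed by a final Sinh, so the model
// exposes two results: the Subtract and the exit Sinh.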
std::shared_ptr<ov::Model> MatMulEltwiseBranchesFunction::initOriginal() const {
auto data_1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
@ -187,6 +217,69 @@ std::shared_ptr<ov::Model> EltwiseLogLoopFunction::initReference() const {
return std::make_shared<Model>(NodeVector{mul}, ParameterVector{data0, data1});
}
std::shared_ptr<ov::Model> EltwiseTwoResultsFunction::initOriginal() const {
auto data0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
auto data1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
auto sinh0 = std::make_shared<op::v0::Sinh>(data0);
auto sinh1 = std::make_shared<op::v0::Sinh>(data1);
auto add = std::make_shared<op::v1::Add>(sinh0, sinh1);
auto hswish = std::make_shared<op::v4::HSwish>(add);
auto relu = std::make_shared<op::v0::Relu>(hswish);
NGRAPH_SUPPRESS_DEPRECATED_START
auto& out_tensor0 = add->get_output_tensor(0);
out_tensor0.set_name("add_out");
out_tensor0.set_names({"add_out", "y0"});
auto& out_tensor1 = relu->get_output_tensor(0);
out_tensor1.set_name("relu_out");
out_tensor1.set_names({"relu_out", "y1"});
NGRAPH_SUPPRESS_DEPRECATED_END
return std::make_shared<Model>(NodeVector{add, relu}, ParameterVector{data0, data1});
}
std::shared_ptr<ov::Model> EltwiseTwoResultsFunction::initReference() const {
auto data0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
auto data1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
auto sinh0 = std::make_shared<op::v0::Sinh>(data0);
auto sinh1 = std::make_shared<op::v0::Sinh>(data1);
auto indata0 = std::make_shared<op::v0::Parameter>(precision, sinh0->get_shape());
auto indata1 = std::make_shared<op::v0::Parameter>(precision, sinh1->get_shape());
auto add = std::make_shared<op::v1::Add>(indata0, indata1);
auto hswish = std::make_shared<op::v4::HSwish>(add);
auto subgraph0 = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{sinh0, sinh1},
std::make_shared<ov::Model>(NodeVector{add, hswish},
ParameterVector{indata0, indata1}));
auto indata2 = std::make_shared<op::v0::Parameter>(precision, subgraph0->get_output_shape(1));
auto relu = std::make_shared<op::v0::Relu>(indata2);
auto subgraph1 = std::make_shared<ngraph::snippets::op::Subgraph>(OutputVector{subgraph0->output(1)},
std::make_shared<ov::Model>(NodeVector{relu},
ParameterVector{indata2}));
NGRAPH_SUPPRESS_DEPRECATED_START
auto& out_tensor0 = subgraph0->get_output_tensor(0);
out_tensor0.set_name("add_out");
out_tensor0.set_names({"add_out", "y0"});
auto& out_tensor1 = subgraph1->get_output_tensor(0);
out_tensor1.set_name("relu_out");
out_tensor1.set_names({"relu_out", "y1"});
NGRAPH_SUPPRESS_DEPRECATED_END
return std::make_shared<Model>(OutputVector{subgraph0->output(0), subgraph1->output(0)}, ParameterVector{data0, data1});
}
std::shared_ptr<ov::Model> TwoInputsAndOutputsFunction::initOriginal() const {
auto data0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
auto data1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
auto sin0 = std::make_shared<op::v0::Sin>(data0);
auto sin1 = std::make_shared<op::v0::Sin>(data1);
auto hswish = std::make_shared<op::v4::HSwish>(sin0);
auto add = std::make_shared<op::v1::Add>(hswish, sin1);
auto relu = std::make_shared<op::v0::Relu>(add);
auto sin3 = std::make_shared<op::v0::Sin>(relu);
return std::make_shared<Model>(NodeVector{hswish, sin3}, ParameterVector{data0, data1});
}
} // namespace snippets
} // namespace test
} // namespace ov