[Snippets] Some optimizations (#12384)
- Implement static TileScheduler to handle compile params processing. Now compile params are accessed only here - TileScheduler should emit code only for necessary scalar/vector Tiles - Perform abstract-to-physical register mapping in one place (currently KernelEmitter constructor) - Implement more precise register mapping, so larger subgraphs could be created (now up to 12 i/o regs instead of 7) Co-authored-by: Ivan Novoselov <ivan.novoselov@intel.com>
This commit is contained in:
parent
fc27a6b49f
commit
69c514563c
@ -51,5 +51,7 @@ public:
|
||||
virtual ~Emitter() = default;
|
||||
};
|
||||
|
||||
using AllocatedEmitter = std::pair<std::shared_ptr<Emitter>, ngraph::snippets::RegInfo>;
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -18,7 +18,7 @@ auto getRegisters(std::shared_ptr<ngraph::Node>& n) -> ngraph::snippets::RegInfo
|
||||
|
||||
/**
|
||||
* @interface TargetMachine
|
||||
* @brief Base class Target machine representation. Target derives from this class to provide generator information about supported emittors
|
||||
* @brief Base class Target machine representation. Target derives from this class to provide generator information about supported emitters
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class TargetMachine {
|
||||
@ -41,9 +41,10 @@ public:
|
||||
*/
|
||||
virtual size_t get_lanes() const = 0;
|
||||
|
||||
|
||||
/**
|
||||
* @brief called by generator to all the emittor for a target machine
|
||||
* @return a map by node's type info with callbacks to create an instance of emmitter for corresponding operation type
|
||||
* @brief called by generator to all the emitter for a target machine
|
||||
* @return a map by node's type info with callbacks to create an instance of emitter for corresponding operation type
|
||||
*/
|
||||
std::function<std::shared_ptr<Emitter>(std::shared_ptr<ngraph::Node>)> get(const ngraph::DiscreteTypeInfo type) const {
|
||||
auto jitter = jitters.find(type);
|
||||
@ -118,6 +119,12 @@ public:
|
||||
*/
|
||||
code generate(std::shared_ptr<ov::Model>& m, const void* compile_params = nullptr) const;
|
||||
|
||||
/**
|
||||
* @brief gets target machine
|
||||
* @return pointer to constant target machine
|
||||
*/
|
||||
std::shared_ptr<const TargetMachine> get_target_machine() const { return target; }
|
||||
|
||||
protected:
|
||||
std::shared_ptr<TargetMachine> target;
|
||||
};
|
||||
|
@ -1,34 +0,0 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/op/op.hpp>
|
||||
#include "load.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface BlockedLoad
|
||||
* @brief Generated by Canonicalization step for blocked data (NCHW<X>c) to be loaded
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class BlockedLoad : public Load {
|
||||
public:
|
||||
OPENVINO_OP("BlockedLoad", "SnippetsOpset", ngraph::snippets::op::Load);
|
||||
|
||||
BlockedLoad(const Output<Node>& x);
|
||||
BlockedLoad() = default;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<BlockedLoad>(new_args.at(0));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -1,36 +0,0 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/op/op.hpp>
|
||||
#include <ngraph/op/parameter.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface BlockedParameter
|
||||
* @brief Represents blocked input (NCHW<X>c) for a subgraph
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class BlockedParameter : public ngraph::op::Parameter {
|
||||
public:
|
||||
OPENVINO_OP("BlockedParameter", "SnippetsOpset", ngraph::op::Parameter);
|
||||
|
||||
BlockedParameter() = default;
|
||||
BlockedParameter(const ngraph::element::Type& element_type, const PartialShape& pshape)
|
||||
: Parameter(element_type, pshape) {
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<BlockedParameter>(m_element_type, m_partial_shape);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -0,0 +1,38 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include "ngraph/op/op.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface ConvertSaturation
|
||||
* @brief It's a ordinary Convert op with specific rules for integer conversion.
|
||||
* The implementation uses "saturation" conversion for integer values.
|
||||
* It means that if the integer values are outside the limits
|
||||
* of the maximum and minimum values of the destination data type, they are clamped.
|
||||
* For example, int_32t ---> int8_t
|
||||
* 129 ---> 127
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class ConvertSaturation : public ov::op::v0::Convert {
|
||||
public:
|
||||
OPENVINO_OP("ConvertSaturation", "SnippetsOpset", ov::op::v0::Convert);
|
||||
|
||||
ConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type);
|
||||
ConvertSaturation() = default;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
|
||||
bool has_evaluate() const override { return false; }
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -0,0 +1,37 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include "ngraph/op/op.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface ConvertTruncation
|
||||
* @brief It's a ordinary Convert op with specific rules for integer conversion.
|
||||
* The implementation "truncation" conversion for integer values.
|
||||
* It means that if there are overflow, the integer values will wrap around.
|
||||
* For example, int_32t ---> int8_t
|
||||
* 129 ---> -127
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class ConvertTruncation : public ov::op::v0::Convert {
|
||||
public:
|
||||
OPENVINO_OP("ConvertTruncation", "SnippetsOpset", ov::op::v0::Convert);
|
||||
|
||||
ConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type);
|
||||
ConvertTruncation() = default;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
|
||||
bool has_evaluate() const override { return false; }
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -12,20 +12,22 @@ namespace op {
|
||||
|
||||
/**
|
||||
* @interface Load
|
||||
* @brief Generated by Canonicalization step where explicit load instruction should be emmiteed
|
||||
* ScalarLoad == scalar instruction + post increment
|
||||
* Load (VectorLoad) == vector instruction + post increment
|
||||
* BroadcastLoad == scalar instruction - post increment
|
||||
* BlockedLoad == vector instruction - post increment
|
||||
* @brief Generated by Canonicalization step where explicit instructions should be emitted for data loading
|
||||
* where number of elements to load is determined by "count"
|
||||
* Default value is "1" - to load one element
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class Load : public ngraph::op::Op {
|
||||
public:
|
||||
OPENVINO_OP("Load", "SnippetsOpset");
|
||||
|
||||
Load(const Output<Node>& x);
|
||||
Load(const Output<Node>& x, const size_t count = 1lu);
|
||||
Load() = default;
|
||||
|
||||
size_t get_count() const { return m_count; }
|
||||
|
||||
void set_count(const size_t count) { m_count = count; }
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
@ -35,6 +37,9 @@ public:
|
||||
OPENVINO_SUPPRESS_DEPRECATED_START
|
||||
bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override;
|
||||
OPENVINO_SUPPRESS_DEPRECATED_END
|
||||
|
||||
protected:
|
||||
size_t m_count = 0lu;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
|
@ -1,34 +0,0 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/op/op.hpp>
|
||||
#include "load.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface ScalarLoad
|
||||
* @brief Generated by Canonicalization for a scalar value load to vector register
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class ScalarLoad : public Load {
|
||||
public:
|
||||
OPENVINO_OP("ScalarLoad", "SnippetsOpset", ngraph::snippets::op::Load);
|
||||
|
||||
ScalarLoad(const Output<Node>& x);
|
||||
ScalarLoad() = default;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<ScalarLoad>(new_args.at(0));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -1,34 +0,0 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/op/op.hpp>
|
||||
#include "store.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface ScalarStore
|
||||
* @brief Generated by Canonicalization for a scalar value store from vector register
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class ScalarStore : public Store {
|
||||
public:
|
||||
OPENVINO_OP("ScalarStore", "SnippetsOpset", ngraph::snippets::op::Store);
|
||||
|
||||
ScalarStore(const Output<Node>& x);
|
||||
ScalarStore() = default;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<ScalarStore>(new_args.at(0));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -11,17 +11,23 @@ namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface Load
|
||||
* @brief Generated by Canonicalization step where explicit store instruction should be emmiteed
|
||||
* @interface Store
|
||||
* @brief Generated by Canonicalization step where explicit instructions should be emitted for data storing
|
||||
* where number of elements to store is determined by "count"
|
||||
* Default value is "1" - to store one element
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class Store : public ngraph::op::Op {
|
||||
public:
|
||||
OPENVINO_OP("Store", "SnippetsOpset");
|
||||
|
||||
Store(const Output<Node>& x);
|
||||
Store(const Output<Node>& x, const size_t count = 1lu);
|
||||
Store() = default;
|
||||
|
||||
size_t get_count() const { return m_count; }
|
||||
|
||||
void set_count(const size_t count) { m_count = count; }
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
@ -31,6 +37,9 @@ public:
|
||||
OPENVINO_SUPPRESS_DEPRECATED_START
|
||||
bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override;
|
||||
OPENVINO_SUPPRESS_DEPRECATED_END
|
||||
|
||||
protected:
|
||||
size_t m_count = 0lu;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
|
@ -89,10 +89,9 @@ public:
|
||||
}
|
||||
|
||||
|
||||
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes,
|
||||
ngraph::pass::Manager& opt, const void* compile_params = nullptr);
|
||||
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes,
|
||||
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, ngraph::pass::Manager& opt,
|
||||
const void* compile_params = nullptr);
|
||||
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, const void* compile_params = nullptr);
|
||||
snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr);
|
||||
snippets::Schedule generate(const void* compile_params = nullptr);
|
||||
Shape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes);
|
||||
@ -107,8 +106,10 @@ public:
|
||||
void serialize() const;
|
||||
|
||||
static auto wrap_node_as_subgraph(const std::shared_ptr<ngraph::Node>& node) -> std::shared_ptr<Subgraph>;
|
||||
static void fill_empty_output_names(const Output<Node>& target_output_node, const Output<Node>& replacement_output_node);
|
||||
|
||||
private:
|
||||
void align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes);
|
||||
void convert_to_snippet_dialect();
|
||||
Shape exec_domain;
|
||||
std::shared_ptr<ov::Model> m_body;
|
||||
|
@ -20,14 +20,27 @@ class Tile : public ngraph::op::Op {
|
||||
public:
|
||||
OPENVINO_OP("Tile", "SnippetsOpset");
|
||||
|
||||
Tile(const std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>>& region);
|
||||
/// \brief Construct an Tile
|
||||
/// \param region The vector of pairs: emitters and the corresponding registers
|
||||
/// \param increment Tile size - count of elements to load and store.
|
||||
/// Vector Tile should have size of vector register and Scalar Tile should have 1
|
||||
/// \param num_inputs Count of inputs
|
||||
/// \param num_outputs Count of outputs
|
||||
/// \param io_dims Vector of last dimensions of inputs and outputs
|
||||
/// \param io_data_sizes Vector of data type sizes of inputs and outputs
|
||||
Tile(const std::vector<AllocatedEmitter>& region, size_t increment, size_t num_inputs, size_t num_outputs,
|
||||
const std::vector<size_t>& io_dims, const std::vector<size_t>& io_data_sizes);
|
||||
Tile() = default;
|
||||
std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>> region;
|
||||
std::vector<AllocatedEmitter> region;
|
||||
size_t increment = 0;
|
||||
size_t num_inputs = 0;
|
||||
size_t num_outputs = 0;
|
||||
std::vector<size_t> io_dims {};
|
||||
std::vector<size_t> io_data_size {};
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override {
|
||||
return std::make_shared<Tile>(region);
|
||||
return std::make_shared<Tile>(region, increment, num_inputs, num_outputs, io_dims, io_data_size);
|
||||
}
|
||||
const void *compile_params;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
|
39
src/common/snippets/include/snippets/op/tile_scheduler.hpp
Normal file
39
src/common/snippets/include/snippets/op/tile_scheduler.hpp
Normal file
@ -0,0 +1,39 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/op/op.hpp"
|
||||
#include "snippets/emitter.hpp"
|
||||
#include "tile.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface TileScheduler
|
||||
* @brief Contains a set of Tiles (currently one vector and one scalar) and performs necessary preparations
|
||||
* before the Tiles could be executed: calculates offsets, sets proper work amounts, decrement pointers if the same data
|
||||
* have to be read several times (broadcasting).
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class TileScheduler : public ngraph::op::Op {
|
||||
public:
|
||||
OPENVINO_OP("TileScheduler", "SnippetsOpset");
|
||||
|
||||
TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region);
|
||||
TileScheduler() = default;
|
||||
AllocatedEmitter vector_region;
|
||||
AllocatedEmitter scalar_region;
|
||||
// todo: this clone_with_new_inputs is irrelevant
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override {
|
||||
return std::make_shared<TileScheduler>(vector_region, scalar_region);
|
||||
}
|
||||
const void *compile_params;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -1,34 +0,0 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/op/op.hpp>
|
||||
#include "load.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface VectorLoad
|
||||
* @brief Generated by Canonicalization for a vector value load to vector register
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class VectorLoad : public Load {
|
||||
public:
|
||||
OPENVINO_OP("VectorLoad", "SnippetsOpset", ngraph::snippets::op::Load);
|
||||
|
||||
VectorLoad(const Output<Node>& x);
|
||||
VectorLoad() = default;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<VectorLoad>(new_args.at(0));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -1,34 +0,0 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/op/op.hpp>
|
||||
#include "store.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface VectorStore
|
||||
* @brief Generated by Canonicalization for a vector value store from vector register
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class VectorStore : public Store {
|
||||
public:
|
||||
OPENVINO_OP("VectorStore", "SnippetsOpset", ngraph::snippets::op::Store);
|
||||
|
||||
VectorStore(const Output<Node>& x);
|
||||
VectorStore() = default;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<VectorStore>(new_args.at(0));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -18,7 +18,7 @@ namespace pass {
|
||||
*/
|
||||
class AssignRegisters : public ngraph::pass::FunctionPass {
|
||||
public:
|
||||
AssignRegisters() {
|
||||
explicit AssignRegisters() {
|
||||
set_property(ngraph::pass::PassProperty::REQUIRE_STATIC_SHAPE, true);
|
||||
}
|
||||
bool run_on_model(const std::shared_ptr<ov::Model>& m) override;
|
||||
|
@ -0,0 +1,31 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface InsertConvertOnInputs
|
||||
* @brief Inserts ConvertSaturation op after Parameters and Scalars to convert data type of inputs
|
||||
* to supported execution data type.
|
||||
* Note: ConvertSaturation op isn't covered by specification of "Convert" op
|
||||
* This op is used for conversion into and from FP32 after the correspoding Load
|
||||
* and before Store to calculate in FP32 inside subgraph body in CPU Plugin
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class InsertConvertOnInputs: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
InsertConvertOnInputs(const ov::element::Type exec_type = ov::element::f32);
|
||||
};
|
||||
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -19,7 +19,7 @@ namespace pass {
|
||||
*/
|
||||
class InsertLoad: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
InsertLoad();
|
||||
InsertLoad(const size_t count = 1lu);
|
||||
};
|
||||
|
||||
/**
|
||||
@ -30,7 +30,7 @@ public:
|
||||
*/
|
||||
class InsertStore: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
InsertStore();
|
||||
InsertStore(const size_t count = 1lu);
|
||||
};
|
||||
|
||||
|
||||
|
@ -0,0 +1,31 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface ResetTypeRelaxedNodePrecision
|
||||
* @brief Reset precision for type relaxed nodes inside body to align precision between nodes.
|
||||
* Should be called after all Convert insertions
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class ResetTypeRelaxedNodePrecision: public ngraph::pass::FunctionPass {
|
||||
public:
|
||||
OPENVINO_RTTI("ResetTypeRelaxedNodePrecision", "0");
|
||||
ResetTypeRelaxedNodePrecision(const ov::element::Type exec_type = ov::element::f32);
|
||||
bool run_on_model(const std::shared_ptr<ngraph::Function>& m) override;
|
||||
private:
|
||||
ov::element::Type exec_type;
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -0,0 +1,28 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface TransofrmConvertToConvertTruncation
|
||||
* @brief Transform Convert to ConvertTruncation with specification conversion rules
|
||||
* Note: ConvertTruncation op is covered by specification of "Convert" op
|
||||
* This op is used for real Convert ops inside subgraph body in CPU Plugin
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class TransformConvertToConvertTruncation: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
TransformConvertToConvertTruncation();
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -12,27 +12,27 @@ namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface ReplaceLoadsWithScalarLoads
|
||||
* @brief Replaces vector loads with scalar versions.
|
||||
* The pass is used to cange element type of function in a canonical form vector to scalar.
|
||||
* @interface SetScalarCountForLoad
|
||||
* @brief Set count `1` for Load to represent as ScalarLoad
|
||||
* The pass is used to change element count to loading to "1" to load scalar value
|
||||
* Used for tail generation
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class ReplaceLoadsWithScalarLoads: public ngraph::pass::MatcherPass {
|
||||
class SetScalarCountForLoad: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
ReplaceLoadsWithScalarLoads();
|
||||
SetScalarCountForLoad();
|
||||
};
|
||||
|
||||
/**
|
||||
* @interface ReplaceStoresWithScalarStores
|
||||
* @brief Replaces vector stores with scalar versions.
|
||||
* The pass is used to cange element type of model in a canonical form vector to scalar.
|
||||
* @interface SetScalarCountForStore
|
||||
* @brief Set count `1` for Store to represent as ScalarStore
|
||||
* The pass is used to change element count to stroring to "1" to store scalar valuw
|
||||
* Used for tail generation
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class ReplaceStoresWithScalarStores: public ngraph::pass::MatcherPass {
|
||||
class SetScalarCountForStore: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
ReplaceStoresWithScalarStores();
|
||||
SetScalarCountForStore();
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
|
@ -7,21 +7,18 @@
|
||||
#include "ngraph/ops.hpp"
|
||||
#include <ngraph/opsets/opset1.hpp>
|
||||
|
||||
#include "op/blockedload.hpp"
|
||||
#include "op/blockedparameter.hpp"
|
||||
#include "op/broadcastload.hpp"
|
||||
#include "op/broadcastmove.hpp"
|
||||
#include "op/convert_saturation.hpp"
|
||||
#include "op/convert_truncation.hpp"
|
||||
#include "op/kernel.hpp"
|
||||
#include "op/load.hpp"
|
||||
#include "op/nop.hpp"
|
||||
#include "op/scalar.hpp"
|
||||
#include "op/scalarload.hpp"
|
||||
#include "op/scalarstore.hpp"
|
||||
#include "op/powerstatic.hpp"
|
||||
#include "op/store.hpp"
|
||||
#include "op/tile.hpp"
|
||||
#include "op/vectorload.hpp"
|
||||
#include "op/vectorstore.hpp"
|
||||
#include "op/tile_scheduler.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
|
@ -11,14 +11,9 @@
|
||||
|
||||
// SnippetS dialect
|
||||
NGRAPH_OP(Load, ngraph::snippets::op)
|
||||
NGRAPH_OP(ScalarLoad, ngraph::snippets::op)
|
||||
NGRAPH_OP(VectorLoad, ngraph::snippets::op)
|
||||
NGRAPH_OP(BlockedLoad, ngraph::snippets::op)
|
||||
NGRAPH_OP(BroadcastLoad, ngraph::snippets::op)
|
||||
|
||||
NGRAPH_OP(Store, ngraph::snippets::op)
|
||||
NGRAPH_OP(ScalarStore, ngraph::snippets::op)
|
||||
NGRAPH_OP(VectorStore, ngraph::snippets::op)
|
||||
|
||||
NGRAPH_OP(BroadcastMove, ngraph::snippets::op)
|
||||
NGRAPH_OP(Scalar, ngraph::snippets::op)
|
||||
@ -29,9 +24,10 @@ NGRAPH_OP(Nop, ngraph::snippets::op)
|
||||
// opset completeness
|
||||
NGRAPH_OP(Constant, ngraph::op)
|
||||
NGRAPH_OP(Parameter, ngraph::op::v0)
|
||||
NGRAPH_OP(BlockedParameter, ngraph::snippets::op)
|
||||
NGRAPH_OP(Result, ngraph::op::v0)
|
||||
NGRAPH_OP(Broadcast, ngraph::op::v1)
|
||||
NGRAPH_OP(ConvertTruncation, ngraph::snippets::op)
|
||||
NGRAPH_OP(ConvertSaturation, ngraph::snippets::op)
|
||||
|
||||
// unary
|
||||
NGRAPH_OP(Abs, ngraph::op::v0)
|
||||
|
@ -17,7 +17,8 @@ auto ngraph::snippets::getRegisters(std::shared_ptr<ngraph::Node>& n) -> ngraph:
|
||||
auto rt = n->get_rt_info();
|
||||
|
||||
// ToDo: change to reg_t
|
||||
std::vector<size_t> rout;
|
||||
std::vector<size_t> rin, rout;
|
||||
|
||||
auto it_rt = rt.find("reginfo");
|
||||
if (it_rt != rt.end()) {
|
||||
for (auto reg : it_rt->second.as<std::vector<size_t>>()) {
|
||||
@ -25,12 +26,11 @@ auto ngraph::snippets::getRegisters(std::shared_ptr<ngraph::Node>& n) -> ngraph:
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<size_t> rin;
|
||||
for (auto input : n->inputs()) {
|
||||
for (const auto& input : n->inputs()) {
|
||||
auto rt = input.get_source_output().get_node_shared_ptr()->get_rt_info();
|
||||
auto it_rt = rt.find("reginfo");
|
||||
if (it_rt != rt.end()) {
|
||||
for (auto reg : it_rt->second.as<std::vector<size_t>>()) {
|
||||
for (auto& reg : it_rt->second.as<std::vector<size_t>>()) {
|
||||
rin.push_back(reg);
|
||||
}
|
||||
}
|
||||
@ -48,51 +48,56 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr<ov:
|
||||
auto results = m->get_results();
|
||||
auto in = params.size();
|
||||
auto out = results.size();
|
||||
auto nptrs = in + out;
|
||||
std::vector<size_t> io_last_dims(in + out);
|
||||
std::vector<size_t> io_data_sizes(in + out);
|
||||
std::transform(params.begin(), params.end(), io_last_dims.begin(),
|
||||
[](const std::shared_ptr<Node>& n){return n->get_output_shape(0).back();});
|
||||
std::transform(results.begin(), results.end(), io_last_dims.begin() + in,
|
||||
[](const std::shared_ptr<Node>& n){return n->get_input_shape(0).back();});
|
||||
std::transform(params.begin(), params.end(), io_data_sizes.begin(),
|
||||
[](const std::shared_ptr<Node>& n){return n->get_element_type().size();});
|
||||
std::transform(results.begin(), results.end(), io_data_sizes.begin() + in,
|
||||
[](const std::shared_ptr<Node>& n){return n->get_element_type().size();});
|
||||
|
||||
OV_ITT_TASK_CHAIN(GENERATE, ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::VectorTile")
|
||||
// vector tile
|
||||
std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>> lowered;
|
||||
std::vector<AllocatedEmitter> lowered;
|
||||
for (auto n : m->get_ordered_ops()) {
|
||||
lowered.push_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
|
||||
lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
|
||||
}
|
||||
OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile")
|
||||
|
||||
// scalar tile
|
||||
auto m_scalar = ov::clone_model(*m.get());
|
||||
ngraph::pass::Manager mng;
|
||||
mng.register_pass<ngraph::snippets::pass::ReplaceLoadsWithScalarLoads>();
|
||||
mng.register_pass<ngraph::snippets::pass::ReplaceStoresWithScalarStores>();
|
||||
mng.register_pass<ngraph::snippets::pass::SetScalarCountForLoad>();
|
||||
mng.register_pass<ngraph::snippets::pass::SetScalarCountForStore>();
|
||||
mng.run_passes(m_scalar);
|
||||
OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile_get")
|
||||
std::vector<std::pair<std::shared_ptr<Emitter>, RegInfo>> scalar_lowered;
|
||||
std::vector<AllocatedEmitter> scalar_lowered;
|
||||
for (auto n : m_scalar->get_ordered_ops()) {
|
||||
scalar_lowered.push_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
|
||||
scalar_lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
|
||||
}
|
||||
OV_ITT_TASK_NEXT(GENERATE, "::Tiles1D")
|
||||
|
||||
OV_ITT_TASK_NEXT(GENERATE, "::Tiles1D");
|
||||
// wrapping into tiles1D
|
||||
std::vector<std::pair<std::shared_ptr<Emitter>, RegInfo>> tiles1D;
|
||||
auto tile = std::make_shared<ngraph::snippets::op::Tile>(lowered);
|
||||
tile->compile_params = compile_params;
|
||||
tiles1D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile),
|
||||
std::make_pair(std::vector<size_t>({target->get_lanes(), 0, nptrs, 1}), std::vector<size_t>{})));
|
||||
tile = std::make_shared<ngraph::snippets::op::Tile>(scalar_lowered);
|
||||
tile->compile_params = compile_params;
|
||||
tiles1D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile),
|
||||
std::make_pair(std::vector<size_t>{{1, target->get_lanes(), nptrs, 1}}, std::vector<size_t>{})));
|
||||
//todo: in, out, and io_last_dims should derive naturally from the graph representation
|
||||
const auto& vector_tile = std::make_shared<ngraph::snippets::op::Tile>(lowered, target->get_lanes(), in, out, io_last_dims, io_data_sizes);
|
||||
const auto& vector_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(vector_tile),
|
||||
std::make_pair(std::vector<size_t>{}, std::vector<size_t>{}));
|
||||
const auto& scalar_tile = std::make_shared<ngraph::snippets::op::Tile>(scalar_lowered, 1, in, out, io_last_dims, io_data_sizes);
|
||||
const auto& scalar_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(scalar_tile),
|
||||
std::make_pair(std::vector<size_t>{}, std::vector<size_t>{}));
|
||||
|
||||
OV_ITT_TASK_NEXT(GENERATE, "::Tiles2D")
|
||||
// wrapping into tiles2D
|
||||
std::vector<std::pair<std::shared_ptr<Emitter>, RegInfo>> tiles2D;
|
||||
tile = std::make_shared<ngraph::snippets::op::Tile>(tiles1D);
|
||||
tile->compile_params = compile_params;
|
||||
tiles2D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile),
|
||||
std::make_pair(std::vector<size_t>({1, 0, nptrs, 0}), std::vector<size_t>{})));
|
||||
auto tile_scheduler = std::make_shared<ngraph::snippets::op::TileScheduler>(vector_region, scalar_region);
|
||||
tile_scheduler->compile_params = compile_params;
|
||||
const auto& tile_scheduler_region = std::make_pair(target->get(ngraph::snippets::op::TileScheduler::get_type_info_static())(tile_scheduler),
|
||||
std::make_pair(std::vector<size_t>({in, out, target->get_lanes()}), std::vector<size_t>{}));
|
||||
|
||||
OV_ITT_TASK_NEXT(GENERATE, "::EmitCode")
|
||||
// emission
|
||||
auto tiles2DKernel = std::make_shared<ngraph::snippets::op::Kernel>(tiles2D);
|
||||
auto tiles2DKernel = std::make_shared<ngraph::snippets::op::Kernel>(std::vector<AllocatedEmitter> {tile_scheduler_region});
|
||||
tiles2DKernel->compile_params = compile_params;
|
||||
std::shared_ptr<Emitter> kernel = target->get(ngraph::snippets::op::Kernel::get_type_info_static())(tiles2DKernel);
|
||||
kernel->emit_code({in, out}, {});
|
||||
|
@ -1,10 +0,0 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/op/blockedload.hpp"
|
||||
|
||||
using namespace ngraph;
|
||||
|
||||
snippets::op::BlockedLoad::BlockedLoad(const Output<Node>& x) : Load(x) {
|
||||
}
|
19
src/common/snippets/src/op/convert_saturation.cpp
Normal file
19
src/common/snippets/src/op/convert_saturation.cpp
Normal file
@ -0,0 +1,19 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/itt.hpp"
|
||||
|
||||
#include "snippets/op/convert_saturation.hpp"
|
||||
|
||||
#include "ngraph/runtime/host_tensor.hpp"
|
||||
|
||||
ngraph::snippets::op::ConvertSaturation::ConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type)
|
||||
: ov::op::v0::Convert({x}, destination_type) {
|
||||
}
|
||||
|
||||
std::shared_ptr<ngraph::Node> ngraph::snippets::op::ConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(ConvertSaturation_clone_with_new_inputs);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<ConvertSaturation>(new_args.at(0), m_destination_type);
|
||||
}
|
19
src/common/snippets/src/op/convert_truncation.cpp
Normal file
19
src/common/snippets/src/op/convert_truncation.cpp
Normal file
@ -0,0 +1,19 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/itt.hpp"
|
||||
|
||||
#include "snippets/op/convert_truncation.hpp"
|
||||
|
||||
#include "ngraph/runtime/host_tensor.hpp"
|
||||
|
||||
ngraph::snippets::op::ConvertTruncation::ConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type)
|
||||
: ov::op::v0::Convert({x}, destination_type) {
|
||||
}
|
||||
|
||||
std::shared_ptr<ngraph::Node> ngraph::snippets::op::ConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(ConvertTruncation_clone_with_new_inputs);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<ConvertTruncation>(new_args.at(0), m_destination_type);
|
||||
}
|
@ -11,7 +11,7 @@
|
||||
using namespace std;
|
||||
using namespace ngraph;
|
||||
|
||||
snippets::op::Load::Load(const Output<Node>& x) : Op({x}) {
|
||||
snippets::op::Load::Load(const Output<Node>& x, const size_t count) : Op({x}), m_count(count) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
@ -22,7 +22,7 @@ bool snippets::op::Load::visit_attributes(AttributeVisitor& visitor) {
|
||||
std::shared_ptr<Node> snippets::op::Load::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(Load);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<Load>(new_args.at(0));
|
||||
return std::make_shared<Load>(new_args.at(0), m_count);
|
||||
}
|
||||
|
||||
void snippets::op::Load::validate_and_infer_types() {
|
||||
|
@ -1,10 +0,0 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/op/scalarload.hpp"
|
||||
|
||||
using namespace ngraph;
|
||||
|
||||
snippets::op::ScalarLoad::ScalarLoad(const Output<Node>& x) : Load(x) {
|
||||
}
|
@ -1,10 +0,0 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/op/scalarstore.hpp"
|
||||
|
||||
using namespace ngraph;
|
||||
|
||||
snippets::op::ScalarStore::ScalarStore(const Output<Node>& x) : Store(x) {
|
||||
}
|
@ -4,14 +4,14 @@
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
|
||||
#include "snippets/op/scalarstore.hpp"
|
||||
#include "snippets/op/store.hpp"
|
||||
|
||||
#include <ngraph/runtime/host_tensor.hpp>
|
||||
|
||||
using namespace std;
|
||||
using namespace ngraph;
|
||||
|
||||
snippets::op::Store::Store(const Output<Node>& x) : Op({x}) {
|
||||
snippets::op::Store::Store(const Output<Node>& x, const size_t count) : Op({x}), m_count(count) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
@ -22,7 +22,7 @@ bool snippets::op::Store::visit_attributes(AttributeVisitor& visitor) {
|
||||
std::shared_ptr<Node> snippets::op::Store::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(Store);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<Store>(new_args.at(0));
|
||||
return std::make_shared<Store>(new_args.at(0), m_count);
|
||||
}
|
||||
|
||||
void snippets::op::Store::validate_and_infer_types() {
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include "snippets/remarks.hpp"
|
||||
|
||||
#include "snippets/op/subgraph.hpp"
|
||||
#include "snippets/op/convert_saturation.hpp"
|
||||
#include "snippets/pass/insert_load_store.hpp"
|
||||
#include "snippets/pass/insert_movebroadcast.hpp"
|
||||
#include "snippets/pass/load_movebroadcast_to_broadcastload.hpp"
|
||||
@ -13,8 +14,15 @@
|
||||
#include "snippets/pass/convert_constants_to_scalars.hpp"
|
||||
#include "snippets/pass/convert_power_to_powerstatic.hpp"
|
||||
#include "snippets/pass/vector_to_scalar.hpp"
|
||||
#include "snippets/pass/transform_convert_to_truncation.hpp"
|
||||
#include "snippets/pass/insert_convert_on_inputs.hpp"
|
||||
#include "snippets/pass/reset_type_relaxed_node_precision.hpp"
|
||||
|
||||
#include "transformations/common_optimizations/nop_elimination.hpp"
|
||||
#include "transformations/utils/utils.hpp"
|
||||
|
||||
#include <ngraph/pass/manager.hpp>
|
||||
#include "ngraph/pass/constant_folding.hpp"
|
||||
#include <openvino/pass/serialize.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
@ -92,6 +100,9 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
|
||||
|
||||
auto body_node = node->clone_with_new_inputs(body_inputs);
|
||||
body_node->set_friendly_name(node->get_friendly_name());
|
||||
for (size_t i = 0; i < node->get_output_size(); i++) {
|
||||
fill_empty_output_names(body_node->output(i), node->output(i));
|
||||
}
|
||||
|
||||
if (node->get_output_size() != body_node->get_output_size()) {
|
||||
throw ngraph::ngraph_error("original node outputs size and extracted subgraph node outputs size doesn't much");
|
||||
@ -118,6 +129,20 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
|
||||
|
||||
return subgraph;
|
||||
}
|
||||
|
||||
void snippets::op::Subgraph::fill_empty_output_names(const Output<Node>& target_output_node, const Output<Node>& replacement_output_node) {
|
||||
NGRAPH_SUPPRESS_DEPRECATED_START
|
||||
auto out_tensor = target_output_node.get_tensor_ptr();
|
||||
const std::string new_name = ngraph::op::util::get_ie_output_name(replacement_output_node);
|
||||
if (out_tensor->get_name().empty()) {
|
||||
out_tensor->set_name(new_name);
|
||||
}
|
||||
if (!replacement_output_node.get_names().empty()) {
|
||||
out_tensor->set_names(replacement_output_node.get_names());
|
||||
}
|
||||
NGRAPH_SUPPRESS_DEPRECATED_END
|
||||
}
|
||||
|
||||
///
|
||||
/// \brief Canonization transforms original subgraph and to canonical form suitable for code generation. In particular,
|
||||
/// it handles supported layout conversions, broadcasts inputs and outputs to a single rank and layout. Canonicalization
|
||||
@ -125,6 +150,7 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
|
||||
/// Canonicalization currently supports only the following layout conversions:
|
||||
/// * None: all inputs have the same layout
|
||||
/// * Planar + blocked: some inputs have blocked, and some have planar layouts, e.g. <N, C, H, W, c> + <N, C, H, W>
|
||||
/// Also there is precision aligning inside body of subgraph during canonicalization
|
||||
Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) {
|
||||
INTERNAL_OP_SCOPE(Subgraph);
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::canonicalize")
|
||||
@ -176,7 +202,8 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape
|
||||
PartialShape::broadcast_merge_into(tmpPShape, inShape, ::ngraph::op::AutoBroadcastType::NUMPY),
|
||||
"Failed to create broadcastable shapes in snippets canonicalization");
|
||||
const auto paramShape = m_body->get_parameters()[i]->get_shape();
|
||||
if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin()))
|
||||
const auto paramType = m_body->get_parameters()[i]->get_element_type();
|
||||
if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin()) || paramType != inType)
|
||||
m_body->replace_parameter(i, std::make_shared<opset1::Parameter>(inType, inShape));
|
||||
}
|
||||
|
||||
@ -213,21 +240,80 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape
|
||||
::ngraph::op::AutoBroadcastType::NUMPY);
|
||||
NODE_VALIDATION_CHECK(this, compatibleWithOtherOutputs, "Snippets output shapes must be numpy broadcastable");
|
||||
}
|
||||
|
||||
// We should insert Converts after Parameters and Constant and before Results
|
||||
// to align precision inside Subgraph body that is supported by Plugin
|
||||
align_element_types(outputShapes, inputShapes);
|
||||
|
||||
exec_domain = outPShape.get_shape();
|
||||
return exec_domain;
|
||||
}
|
||||
|
||||
void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes,
|
||||
const BlockedShapeVector& inputShapes) {
|
||||
// TODO: At the moment snippets support execution in only one element type
|
||||
const auto execution_element_type = ov::element::f32;
|
||||
|
||||
ngraph::pass::Manager p_manager;
|
||||
p_manager.register_pass<snippets::pass::TransformConvertToConvertTruncation>();
|
||||
p_manager.run_passes(m_body);
|
||||
|
||||
const auto& body_results = m_body->get_results();
|
||||
for (size_t i = 0; i < outputShapes.size(); i++) {
|
||||
const auto needed_out_type = std::get<2>(outputShapes[i]);
|
||||
|
||||
// If there is real Convert from graph (ConvertTruncation) before Result
|
||||
// we should check destination type and insert ConvertSaturation before that if needed.
|
||||
// For example, to return original element type after Convert insertion on inputs
|
||||
std::shared_ptr<ov::Node> first_convert = body_results[i];
|
||||
while (ov::is_type<ngraph::snippets::op::ConvertTruncation>(first_convert->get_input_node_ptr(0))) {
|
||||
first_convert = first_convert->get_input_node_shared_ptr(0);
|
||||
}
|
||||
if (auto existing_convert_t = ngraph::as_type_ptr<ngraph::snippets::op::ConvertTruncation>(first_convert)) {
|
||||
const auto original_input_element_type = existing_convert_t->get_input_element_type(0);
|
||||
if (original_input_element_type != execution_element_type) {
|
||||
const auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(
|
||||
existing_convert_t->get_input_node_shared_ptr(0), original_input_element_type);
|
||||
existing_convert_t->set_argument(0, convert);
|
||||
}
|
||||
}
|
||||
|
||||
// We should insert Convert before Results to return original output element type
|
||||
const auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(
|
||||
body_results[i]->get_input_node_shared_ptr(0), needed_out_type);
|
||||
body_results[i]->set_argument(0, convert);
|
||||
}
|
||||
|
||||
// After Convert insertion we should make the following steps:
|
||||
// - insert ConvertSaturation after inputs and scalar to start aligning of exec data type inside body
|
||||
// - manually set output element types of type relaxed nodes to align element type inside subgraph body
|
||||
// - after Convert insertion on inputs and after scalars we should use ConstantFolding pass to convert
|
||||
// element type of Scalars before inference
|
||||
// - eliminate redundant Convert that could have been inserted
|
||||
ngraph::pass::Manager manager;
|
||||
manager.register_pass<snippets::pass::InsertConvertOnInputs>(execution_element_type);
|
||||
manager.register_pass<snippets::pass::ResetTypeRelaxedNodePrecision>(execution_element_type);
|
||||
manager.register_pass<ngraph::pass::ConstantFolding>();
|
||||
manager.register_pass<ngraph::pass::EliminateConvert>();
|
||||
manager.run_passes(m_body);
|
||||
}
|
||||
|
||||
void snippets::op::Subgraph::convert_to_snippet_dialect() {
|
||||
INTERNAL_OP_SCOPE(Subgraph);
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::convert_to_snippet_dialect")
|
||||
auto skip_matching_domain = [](const std::shared_ptr<const ov::Node>& n) -> bool {
|
||||
return n->get_input_shape(0).back() != 1;
|
||||
};
|
||||
|
||||
// At the moment we support only full vector Load/Store and scalar Load/Store so that count is equal to lanes.
|
||||
// Then we are going to support variadic Load/Store with different element count
|
||||
const size_t count = m_generator->get_target_machine()->get_lanes();
|
||||
|
||||
ngraph::pass::Manager manager;
|
||||
manager.register_pass<snippets::pass::ConvertConstantsToScalars>();
|
||||
manager.register_pass<snippets::pass::ConvertPowerToPowerStatic>();
|
||||
manager.register_pass<snippets::pass::InsertLoad>();
|
||||
manager.register_pass<snippets::pass::InsertStore>();
|
||||
manager.register_pass<snippets::pass::InsertLoad>(count);
|
||||
manager.register_pass<snippets::pass::InsertStore>(count);
|
||||
manager.register_pass<snippets::pass::InsertMoveBroadcast>();
|
||||
manager.register_pass<snippets::pass::LoadMoveBroadcastToBroadcastLoad>();
|
||||
// Note that, BrodacastMove is typically inserted right after the Load. Such cases are typical for
|
||||
@ -246,12 +332,12 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() {
|
||||
// Result
|
||||
// Note: Load* should be replaced with ScalarLoad in this example to avoid invalid read in vector Tile.
|
||||
if (!exec_domain.empty() && exec_domain.back() != 1) {
|
||||
manager.register_pass<snippets::pass::ReplaceLoadsWithScalarLoads>();
|
||||
manager.register_pass<snippets::pass::ReplaceStoresWithScalarStores>();
|
||||
manager.register_pass<snippets::pass::SetScalarCountForLoad>();
|
||||
manager.register_pass<snippets::pass::SetScalarCountForStore>();
|
||||
manager.get_pass_config()->
|
||||
set_callback<ngraph::snippets::pass::ReplaceLoadsWithScalarLoads>(skip_matching_domain);
|
||||
set_callback<ngraph::snippets::pass::SetScalarCountForLoad>(skip_matching_domain);
|
||||
manager.get_pass_config()->
|
||||
set_callback<ngraph::snippets::pass::ReplaceStoresWithScalarStores>(skip_matching_domain);
|
||||
set_callback<ngraph::snippets::pass::SetScalarCountForStore>(skip_matching_domain);
|
||||
}
|
||||
manager.run_passes(m_body);
|
||||
}
|
||||
|
@ -8,5 +8,8 @@
|
||||
using namespace std;
|
||||
using namespace ngraph;
|
||||
|
||||
snippets::op::Tile::Tile(const std::vector<std::pair<std::shared_ptr<snippets::Emitter>, snippets::RegInfo>>& nested) : Op(), region(nested) {
|
||||
snippets::op::Tile::Tile(const std::vector<AllocatedEmitter>& region, size_t increment,
|
||||
size_t num_inputs, size_t num_outputs,
|
||||
const std::vector<size_t>& io_dims, const std::vector<size_t>& io_data_sizes) :
|
||||
Op(), region(region), increment(increment), num_inputs(num_inputs), num_outputs(num_outputs), io_dims(io_dims), io_data_size(io_data_sizes) {
|
||||
}
|
||||
|
10
src/common/snippets/src/op/tile_scheduler.cpp
Normal file
10
src/common/snippets/src/op/tile_scheduler.cpp
Normal file
@ -0,0 +1,10 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/op/tile_scheduler.hpp"
|
||||
#include "snippets/generator.hpp"
|
||||
|
||||
ngraph::snippets::op::TileScheduler::TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region)
|
||||
: Op(), vector_region{vector_region}, scalar_region{scalar_region} {
|
||||
}
|
@ -1,10 +0,0 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/op/vectorload.hpp"
|
||||
|
||||
using namespace ngraph;
|
||||
|
||||
snippets::op::VectorLoad::VectorLoad(const Output<Node>& x) : Load(x) {
|
||||
}
|
@ -1,10 +0,0 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/op/vectorstore.hpp"
|
||||
|
||||
using namespace ngraph;
|
||||
|
||||
snippets::op::VectorStore::VectorStore(const Output<Node>& x) : Store(x) {
|
||||
}
|
@ -16,7 +16,6 @@
|
||||
bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr<ov::Model>& f) {
|
||||
RUN_ON_MODEL_SCOPE(AssignRegisters);
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::AssignRegisters")
|
||||
int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1
|
||||
using Reg = size_t;
|
||||
auto ops = f->get_ordered_ops();
|
||||
decltype(ops) stmts;
|
||||
@ -26,8 +25,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
|
||||
|
||||
size_t rdx = 0;
|
||||
std::map<std::shared_ptr<descriptor::Tensor>, Reg> regs;
|
||||
for (auto op : stmts) {
|
||||
for (auto output : op->outputs()) {
|
||||
for (const auto& op : stmts) {
|
||||
for (const auto& output : op->outputs()) {
|
||||
regs[output.get_tensor_ptr()] = rdx++;
|
||||
}
|
||||
}
|
||||
@ -35,9 +34,9 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
|
||||
std::vector<std::set<Reg>> used;
|
||||
std::vector<std::set<Reg>> def;
|
||||
|
||||
for (auto op : stmts) {
|
||||
for (const auto& op : stmts) {
|
||||
std::set<Reg> u;
|
||||
for (auto input : op->inputs()) {
|
||||
for (const auto& input : op->inputs()) {
|
||||
if (regs.count(input.get_tensor_ptr())) {
|
||||
u.insert(regs[input.get_tensor_ptr()]);
|
||||
}
|
||||
@ -46,7 +45,7 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
|
||||
|
||||
std::set<Reg> d;
|
||||
if (!std::dynamic_pointer_cast<snippets::op::Store>(op)) {
|
||||
for (auto output : op->outputs()) {
|
||||
for (const auto& output : op->outputs()) {
|
||||
d.insert(regs[output.get_tensor_ptr()]);
|
||||
}
|
||||
}
|
||||
@ -65,8 +64,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
|
||||
for (size_t n = 0; n < stmts.size(); n++) {
|
||||
auto node = stmts[n];
|
||||
if (!std::dynamic_pointer_cast<snippets::op::Store>(node)) {
|
||||
for (auto out : node->outputs()) {
|
||||
for (auto port : out.get_target_inputs()) {
|
||||
for (const auto& out : node->outputs()) {
|
||||
for (const auto& port : out.get_target_inputs()) {
|
||||
auto pos = std::find(stmts.begin(), stmts.end(), port.get_node()->shared_from_this());
|
||||
if (pos != stmts.end()) {
|
||||
auto k = pos-stmts.begin();
|
||||
@ -136,46 +135,32 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
|
||||
|
||||
std::map<std::shared_ptr<descriptor::Tensor>, Reg> physical_regs;
|
||||
|
||||
for (auto reg : regs) {
|
||||
for (const auto& reg : regs) {
|
||||
physical_regs[reg.first] = register_map[reg.second];
|
||||
}
|
||||
|
||||
size_t constantID = 0;
|
||||
|
||||
for (auto n : f->get_ordered_ops()) {
|
||||
const auto num_parameters = f->get_parameters().size();
|
||||
for (const auto& n : f->get_ordered_ops()) {
|
||||
auto& rt = n->get_rt_info();
|
||||
// nothing to do for model signature
|
||||
if (std::dynamic_pointer_cast<opset1::Parameter>(n) || std::dynamic_pointer_cast<opset1::Result>(n)) {
|
||||
std::vector<size_t> regs;
|
||||
regs.reserve(n->outputs().size());
|
||||
/* The main idea here is that each operation stores its output regs in rt["reginfo"]. Input and output regs are
|
||||
* then derived by parsing node's and parent's rt["reginfo"], look into ngraph::snippets::getRegisters for details.
|
||||
* Note also that Parameter and Result store general-purpose register index, because they work with memory
|
||||
* (memory pointer is stored in gpr). All other "regular" ops store vector regs indexes, since calculations are
|
||||
* performed on registers.
|
||||
*/
|
||||
if (is_type<ov::op::v0::Result>(n)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// store only effective address
|
||||
if (auto result = std::dynamic_pointer_cast<snippets::op::Store>(n)) {
|
||||
auto ea = reg64_tmp_start+static_cast<int64_t>(f->get_result_index(result) + f->get_parameters().size());
|
||||
rt["effectiveAddress"] = ea;
|
||||
continue;
|
||||
}
|
||||
// store effective address and procced with vector registers
|
||||
if (ov::as_type_ptr<ngraph::snippets::op::Load>(n) || ov::as_type_ptr<ngraph::snippets::op::BroadcastLoad>(n)) {
|
||||
auto source = n->get_input_source_output(0).get_node_shared_ptr();
|
||||
|
||||
if (auto param = ov::as_type_ptr<opset1::Parameter>(source)) {
|
||||
auto ea = reg64_tmp_start+static_cast<int64_t>(f->get_parameter_index(param));
|
||||
rt["effectiveAddress"] = ea;
|
||||
} else if (auto constant = ov::as_type_ptr<opset1::Constant>(source)) {
|
||||
auto ea = reg64_tmp_start+static_cast<int64_t>(f->get_parameters().size() + f->get_results().size() + 1 + constantID);
|
||||
rt["effectiveAddress"] = ea;
|
||||
constantID++;
|
||||
} else {
|
||||
throw ngraph_error("load/broadcast should follow only Parameter or non-Scalar constant");
|
||||
} else if (const auto& param = ov::as_type_ptr<ov::op::v0::Parameter>(n)) {
|
||||
regs.push_back(f->get_parameter_index(param));
|
||||
} else if (const auto& store = ov::as_type_ptr<ngraph::snippets::op::Store>(n)) {
|
||||
regs.push_back(f->get_result_index(store) + num_parameters);
|
||||
} else {
|
||||
for (const auto& output : n->outputs()) {
|
||||
auto allocated = physical_regs[output.get_tensor_ptr()];
|
||||
regs.push_back(allocated);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<size_t> regs; regs.reserve(n->outputs().size());
|
||||
for (auto output : n->outputs()) {
|
||||
auto allocated = physical_regs[output.get_tensor_ptr()];
|
||||
regs.push_back(allocated);
|
||||
}
|
||||
rt["reginfo"] = regs;
|
||||
}
|
||||
|
||||
|
@ -99,15 +99,17 @@ auto is_layout_oblivious(const std::shared_ptr<const Node> &n) -> bool {
|
||||
|| ov::is_type<opset1::Tanh>(n)
|
||||
|| ov::is_type<ngraph::op::v0::Gelu>(n)
|
||||
|| ov::is_type<ngraph::op::v7::Gelu>(n)
|
||||
|| ov::is_type<ngraph::op::v4::HSwish>(n);
|
||||
|| ov::is_type<ngraph::op::v4::HSwish>(n)
|
||||
|| ov::is_type<ngraph::op::v0::Convert>(n);
|
||||
};
|
||||
return is_layout_oblivious_unary(n) || is_layout_oblivious_binary(n);
|
||||
}
|
||||
|
||||
auto has_supported_in_out(const std::shared_ptr<const Node> &n) -> bool {
|
||||
auto supported = [](descriptor::Tensor& t) -> bool {
|
||||
return t.get_element_type() == ngraph::element::f32 &&
|
||||
t.get_partial_shape().is_static();
|
||||
static const std::set<ngraph::element::Type> supported_data_types =
|
||||
{ ngraph::element::f32, ngraph::element::i32, ngraph::element::bf16, ngraph::element::i8, ngraph::element::u8 };
|
||||
return t.get_partial_shape().is_static() && supported_data_types.count(t.get_element_type()) != 0;
|
||||
};
|
||||
const auto & inputs = n->inputs();
|
||||
const auto & outputs = n->outputs();
|
||||
@ -148,19 +150,9 @@ auto update_out_tensor_name(std::shared_ptr<ngraph::snippets::op::Subgraph> &sub
|
||||
for (unsigned int i = 0; i < subgraph->get_output_size() && not_set; i++) {
|
||||
for (const auto &in : subgraph->get_output_target_inputs(i)) {
|
||||
if (ov::is_type<opset1::Result>(in.get_node())) {
|
||||
auto out_tensor = subgraph->output(i).get_tensor_ptr();
|
||||
NGRAPH_SUPPRESS_DEPRECATED_START
|
||||
if (out_tensor->get_name().empty()) {
|
||||
const auto& body_result = subgraph->get_body()->get_output_op(i);
|
||||
const auto& body_result_input = body_result->get_input_source_output(0);
|
||||
// Note that create_ie_output_name() checks only deprecated output.get_tensor().get_name()
|
||||
// However output.get_tensor().get_names() should also be updated
|
||||
if (!body_result_input.get_names().empty())
|
||||
out_tensor->add_names(body_result_input.get_names());
|
||||
std::string newTensorName = ngraph::op::util::get_ie_output_name(body_result_input);
|
||||
out_tensor->set_name(newTensorName);
|
||||
}
|
||||
NGRAPH_SUPPRESS_DEPRECATED_END
|
||||
const auto& body_result = subgraph->get_body()->get_output_op(i);
|
||||
const auto& body_result_input = body_result->get_input_source_output(0);
|
||||
op::Subgraph::fill_empty_output_names(subgraph->output(i), body_result_input);
|
||||
not_set = false;
|
||||
break;
|
||||
}
|
||||
@ -406,6 +398,40 @@ TokenizeSnippets::TokenizeSnippets() {
|
||||
auto& input_body = clones[input_node];
|
||||
size_t source_output_index = input_value.get_index();
|
||||
auto source_result = input_body->get_results()[source_output_index];
|
||||
|
||||
// We cannot add new node, that is not Convert, after Convert (that is start node) to avoid arithmetic problems with conversion
|
||||
// We can add any new node in Subgraph after Convert (bacause after Input)
|
||||
// Parameter
|
||||
// |
|
||||
// Convert
|
||||
//
|
||||
// We cannot add new node, that isn't Convert, in Subgraph after existing Convert
|
||||
// Parameter
|
||||
// Relu
|
||||
// Convert
|
||||
//
|
||||
// But we can add new Convert in Subgraph after existing Convert
|
||||
// Parameter
|
||||
// Relu
|
||||
// Convert
|
||||
// Convert
|
||||
//
|
||||
// Thus, We can grow subgraph only if Convert is the first node of subgraph and have to abort it's the last one and we want to add not Convert
|
||||
// We have this limitation because at the moment we support only one execution precision inside body, so
|
||||
// if there is Convert with input and output data types that aren't equal to supported exec type,
|
||||
// we can get conversion math errors
|
||||
const auto output_of_subgraph = source_result->get_input_node_shared_ptr(0);
|
||||
if (!ov::is_type<ngraph::op::v0::Convert>(node) && ov::is_type<ngraph::op::v0::Convert>(output_of_subgraph)) {
|
||||
// Also we can add new node after < Parameter -> Convert -> Convert -> Convert >
|
||||
auto grandparent = output_of_subgraph->get_input_node_ptr(0);
|
||||
while (ov::is_type<ngraph::op::v0::Convert>(grandparent)) {
|
||||
grandparent = grandparent->get_input_node_ptr(0);
|
||||
}
|
||||
|
||||
if (!ov::is_type<ngraph::op::v0::Parameter>(grandparent)) {
|
||||
return abort_with_strategy("Convert supports only as Input and as Result of subgraph. Aborting");
|
||||
}
|
||||
}
|
||||
// Result op has a single input
|
||||
internal_inputs.push_back(source_result->input_value(0));
|
||||
} else {
|
||||
@ -477,7 +503,7 @@ TokenizeSnippets::TokenizeSnippets() {
|
||||
throw ngraph_error("body results and node results size mismatch during subgraph collaps");
|
||||
}
|
||||
// todo: move this plugin-specific constraint to the plugin callback
|
||||
if (body_parameters.size() + body_results.size() > 7) {
|
||||
if (body_parameters.size() + body_results.size() > 12) {
|
||||
const std::string message_reset = "new subgraph is created. Impossible to schedule subgraph with " +
|
||||
std::to_string(body_parameters.size()) + " inputs and " + std::to_string(body_results.size()) + " outputs.";
|
||||
const std::string message_abort = "failed to continue subgraph. Impossible to schedule subgraph with " +
|
||||
|
72
src/common/snippets/src/pass/insert_convert_on_inputs.cpp
Normal file
72
src/common/snippets/src/pass/insert_convert_on_inputs.cpp
Normal file
@ -0,0 +1,72 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
#include "snippets/remarks.hpp"
|
||||
|
||||
#include "snippets/pass/insert_convert_on_inputs.hpp"
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
|
||||
#include "ngraph/type.hpp"
|
||||
#include "ngraph/node.hpp"
|
||||
|
||||
#include <ngraph/opsets/opset1.hpp>
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
#include <ngraph/pattern/op/or.hpp>
|
||||
|
||||
// We should recursivelly (after full sequences of ConvertTruncation) go through inputs and
|
||||
// insert ConvertSaturation with supported element type before eltwises
|
||||
// NOTE: JUST EXAMPLE:
|
||||
// Parameter I8
|
||||
// ConvertTruncation U8
|
||||
// / | \
|
||||
// ConvertTruncation F32 ConvertTruncation I32 ConvertTruncation BF16
|
||||
// Eltwise ConvertSaturation FP32 ConvertTruncation I32
|
||||
// <> Eltwise ConvertSaturation FP32
|
||||
// <> Eltwise
|
||||
bool insertConvertSaturationAfterNode(const std::shared_ptr<ov::Node>& node, const ov::element::Type element_type) {
|
||||
bool rewritten = false;
|
||||
for (const auto& output : node->outputs()) {
|
||||
for (auto consumer : output.get_target_inputs()) {
|
||||
const auto output_shared_node = consumer.get_node()->shared_from_this();
|
||||
// Go down through ConvertTruncation sequence
|
||||
if (auto existing_convert_t = ov::as_type_ptr<ngraph::snippets::op::ConvertTruncation>(output_shared_node)) {
|
||||
rewritten = insertConvertSaturationAfterNode(existing_convert_t, element_type);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if ConvertSaturation already exists with supported element type or not and insert ConvertSaturation with supported element type
|
||||
auto existing_convert_s = ov::as_type_ptr<ngraph::snippets::op::ConvertSaturation>(output_shared_node);
|
||||
if ((!existing_convert_s && !ov::is_type<ov::op::v0::Result>(output_shared_node) && consumer.get_element_type() != element_type) ||
|
||||
(existing_convert_s && existing_convert_s->get_destination_type() != element_type)) {
|
||||
const auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(node, element_type);
|
||||
consumer.replace_source_output(convert);
|
||||
rewritten |= true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return rewritten;
|
||||
}
|
||||
|
||||
ngraph::snippets::pass::InsertConvertOnInputs::InsertConvertOnInputs(const ov::element::Type exec_type) {
|
||||
MATCHER_SCOPE(InsertConvertOnInputs);
|
||||
|
||||
auto param_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Parameter>();
|
||||
auto scalar_pattern = pattern::wrap_type<opset1::Constant>(
|
||||
[=](Output<Node> output) -> bool { return ngraph::shape_size(output.get_shape()) == 1; });
|
||||
auto input = std::make_shared<pattern::op::Or>(OutputVector{ param_pattern, scalar_pattern });
|
||||
|
||||
ngraph::matcher_pass_callback callback = [this, exec_type](ngraph::pattern::Matcher& m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertConvertOnInputs")
|
||||
auto root = m.get_match_root();
|
||||
|
||||
auto rewritten = insertConvertSaturationAfterNode(root, exec_type);
|
||||
|
||||
return rewritten;
|
||||
};
|
||||
|
||||
auto m = std::make_shared<ngraph::pattern::Matcher>(input, matcher_name);
|
||||
register_matcher(m, callback);
|
||||
}
|
@ -12,11 +12,11 @@
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
|
||||
ngraph::snippets::pass::InsertLoad::InsertLoad() {
|
||||
ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) {
|
||||
MATCHER_SCOPE(InsertLoad);
|
||||
register_matcher(std::make_shared<ngraph::pattern::Matcher>(
|
||||
ngraph::pattern::wrap_type<ngraph::opset1::Parameter>(), matcher_name),
|
||||
[this](ngraph::pattern::Matcher &m) {
|
||||
[this, count](ngraph::pattern::Matcher &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertLoad")
|
||||
auto root = m.get_match_root();
|
||||
|
||||
@ -29,7 +29,7 @@ ngraph::snippets::pass::InsertLoad::InsertLoad() {
|
||||
}
|
||||
}
|
||||
|
||||
auto load = std::make_shared<ngraph::snippets::op::Load> (root);
|
||||
auto load = std::make_shared<ngraph::snippets::op::Load>(root, count);
|
||||
ngraph::copy_runtime_info(root, load);
|
||||
|
||||
bool rewritten = false;
|
||||
@ -46,11 +46,11 @@ ngraph::snippets::pass::InsertLoad::InsertLoad() {
|
||||
});
|
||||
}
|
||||
|
||||
ngraph::snippets::pass::InsertStore::InsertStore() {
|
||||
ngraph::snippets::pass::InsertStore::InsertStore(const size_t count) {
|
||||
MATCHER_SCOPE(InsertStore);
|
||||
register_matcher(std::make_shared<ngraph::pattern::Matcher>(
|
||||
ngraph::pattern::wrap_type<ngraph::opset1::Result>(), matcher_name),
|
||||
[this](ngraph::pattern::Matcher &m) {
|
||||
[this, count](ngraph::pattern::Matcher &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertStore")
|
||||
auto root = m.get_match_root();
|
||||
|
||||
@ -61,7 +61,7 @@ ngraph::snippets::pass::InsertStore::InsertStore() {
|
||||
}
|
||||
}
|
||||
|
||||
auto store = std::make_shared<ngraph::snippets::op::Store> (root->input_value(0));
|
||||
auto store = std::make_shared<ngraph::snippets::op::Store> (root->input_value(0), count);
|
||||
ngraph::copy_runtime_info(root, store);
|
||||
root->set_argument(0, store);
|
||||
return true;
|
||||
|
@ -15,7 +15,7 @@
|
||||
ngraph::snippets::pass::LoadMoveBroadcastToBroadcastLoad::LoadMoveBroadcastToBroadcastLoad() {
|
||||
MATCHER_SCOPE(LoadMoveBroadcastToBroadcastLoad);
|
||||
auto param_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Parameter>();
|
||||
auto load_pattern = std::make_shared<ngraph::snippets::op::Load>(param_pattern);
|
||||
auto load_pattern = ngraph::pattern::wrap_type<ngraph::snippets::op::Load>({param_pattern});
|
||||
auto fbn = std::make_shared<ngraph::snippets::op::BroadcastMove>(load_pattern, Shape{1});
|
||||
|
||||
register_matcher(std::make_shared<ngraph::pattern::Matcher>(fbn, matcher_name),
|
||||
|
@ -0,0 +1,31 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
|
||||
#include "snippets/op/convert_saturation.hpp"
|
||||
#include "snippets/pass/reset_type_relaxed_node_precision.hpp"
|
||||
#include "ngraph_ops/type_relaxed.hpp"
|
||||
|
||||
#include <ngraph/rt_info.hpp>
|
||||
|
||||
|
||||
ngraph::snippets::pass::ResetTypeRelaxedNodePrecision::ResetTypeRelaxedNodePrecision(const ov::element::Type exec_type) : exec_type(exec_type) { }
|
||||
|
||||
bool ngraph::snippets::pass::ResetTypeRelaxedNodePrecision::run_on_model(const std::shared_ptr<ov::Model> &m) {
|
||||
RUN_ON_FUNCTION_SCOPE(ResetTypeRelaxedNodePrecision);
|
||||
bool rewritten = false;
|
||||
for (auto& op : m->get_ordered_ops()) {
|
||||
if (auto node = std::dynamic_pointer_cast<ngraph::op::TypeRelaxedBase>(op)) {
|
||||
for (int i = 0; i < op->outputs().size(); i++) {
|
||||
node->set_overridden_output_type(exec_type, i);
|
||||
rewritten |= true;
|
||||
}
|
||||
} else {
|
||||
op->validate_and_infer_types();
|
||||
}
|
||||
}
|
||||
|
||||
return rewritten;
|
||||
}
|
@ -0,0 +1,34 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/remarks.hpp"
|
||||
#include <snippets/itt.hpp>
|
||||
|
||||
#include "snippets/pass/transform_convert_to_truncation.hpp"
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
|
||||
#include <ngraph/opsets/opset1.hpp>
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
|
||||
ngraph::snippets::pass::TransformConvertToConvertTruncation::TransformConvertToConvertTruncation() {
|
||||
MATCHER_SCOPE(TransformConvertToConvertTruncation);
|
||||
register_matcher(std::make_shared<ngraph::pattern::Matcher>(
|
||||
ngraph::pattern::wrap_type<ngraph::opset1::Convert>()),
|
||||
[this](ngraph::pattern::Matcher &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::TransformConvertToConvertTruncation")
|
||||
const auto root = m.get_match_root();
|
||||
const auto convert = ngraph::as_type_ptr<ngraph::opset1::Convert>(root);
|
||||
if (!convert)
|
||||
return false;
|
||||
|
||||
auto convert_truncation = std::make_shared<op::ConvertTruncation>(convert->get_input_source_output(0),
|
||||
convert->get_destination_type());
|
||||
convert_truncation->set_friendly_name(convert->get_friendly_name());
|
||||
ngraph::copy_runtime_info(convert, convert_truncation);
|
||||
ngraph::replace_node(convert, convert_truncation);
|
||||
|
||||
return true;
|
||||
});
|
||||
}
|
@ -7,40 +7,43 @@
|
||||
#include "snippets/pass/vector_to_scalar.hpp"
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
|
||||
#include <ngraph/opsets/opset1.hpp>
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
|
||||
ngraph::snippets::pass::ReplaceLoadsWithScalarLoads::ReplaceLoadsWithScalarLoads() {
|
||||
MATCHER_SCOPE(ReplaceLoadsWithScalarLoads);
|
||||
ngraph::snippets::pass::SetScalarCountForLoad::SetScalarCountForLoad() {
|
||||
MATCHER_SCOPE(SetScalarCountForLoad);
|
||||
register_matcher(std::make_shared<ngraph::pattern::Matcher>(
|
||||
ngraph::pattern::wrap_type<ngraph::snippets::op::Load>(), matcher_name),
|
||||
[this](ngraph::pattern::Matcher &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ReplaceLoadsWithScalarLoads_callback")
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SetScalarCountForLoad_callback")
|
||||
auto root = m.get_match_root();
|
||||
if (transformation_callback(root))
|
||||
return false;
|
||||
auto load = std::make_shared<ngraph::snippets::op::ScalarLoad> (root->input_value(0));
|
||||
load->set_friendly_name(root->get_friendly_name());
|
||||
ngraph::copy_runtime_info(root, load);
|
||||
ngraph::replace_node(root, load);
|
||||
|
||||
const auto load = ov::as_type_ptr<ngraph::snippets::op::Load>(root);
|
||||
if (!load)
|
||||
return false;
|
||||
|
||||
load->set_count(1lu);
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
||||
ngraph::snippets::pass::ReplaceStoresWithScalarStores::ReplaceStoresWithScalarStores() {
|
||||
MATCHER_SCOPE(ReplaceStoresWithScalarStores);
|
||||
ngraph::snippets::pass::SetScalarCountForStore::SetScalarCountForStore() {
|
||||
MATCHER_SCOPE(SetScalarCountForStore);
|
||||
register_matcher(std::make_shared<ngraph::pattern::Matcher>(
|
||||
ngraph::pattern::wrap_type<ngraph::snippets::op::Store>(), matcher_name),
|
||||
[this](ngraph::pattern::Matcher &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ReplaceStoresWithScalarStores_callback")
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SetScalarCountForStore_callback")
|
||||
auto root = m.get_match_root();
|
||||
if (transformation_callback(root))
|
||||
return false;
|
||||
auto store = std::make_shared<ngraph::snippets::op::ScalarStore> (root->input_value(0));
|
||||
store->set_friendly_name(root->get_friendly_name());
|
||||
ngraph::copy_runtime_info(root, store);
|
||||
ngraph::replace_node(root, store);
|
||||
|
||||
const auto store = ov::as_type_ptr<ngraph::snippets::op::Store>(root);
|
||||
if (!store)
|
||||
return false;
|
||||
|
||||
store->set_count(1lu);
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
@ -29,7 +29,7 @@ public:
|
||||
DummyTargetMachine();
|
||||
bool is_supported() const override { return true; }
|
||||
ngraph::snippets::code get_snippet() const override { return nullptr; }
|
||||
size_t get_lanes() const override { return 1; }
|
||||
size_t get_lanes() const override { return 10; }
|
||||
};
|
||||
|
||||
class DummyGenerator : public ngraph::snippets::Generator {
|
||||
|
@ -0,0 +1,40 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "lowering_utils.hpp"
|
||||
#include "snippets_helpers.hpp"
|
||||
|
||||
/* The main purpose is to test that:
|
||||
* - Load/Store ops are inserted
|
||||
* - Load + BroadcastMove fuses to BroadcastLoad (not the main focus, but still had to cover; overlays with insert_movebroadcast.cpp)
|
||||
* - Proper Load/Stores are converted to scalar form to avoid invalid memory access by vector tile
|
||||
* (temporary disabled, since corresponding PR is not merged yet)
|
||||
*/
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
typedef std::tuple<
|
||||
Shape, // Input shape 0
|
||||
Shape, // Input shape 1
|
||||
Shape, // Input shape 2
|
||||
Shape, // Broadcast shape 0
|
||||
Shape, // Broadcast shape 1
|
||||
Shape // Broadcast shape 2
|
||||
> insertLoadStoreParams;
|
||||
|
||||
class InsertLoadStoreTests : public LoweringTests, public testing::WithParamInterface<insertLoadStoreParams> {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<insertLoadStoreParams> obj);
|
||||
protected:
|
||||
void SetUp() override;
|
||||
std::shared_ptr<SnippetsFunctionBase> snippets_function;
|
||||
};
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -23,18 +23,15 @@ DummyTargetMachine::DummyTargetMachine() {
|
||||
jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor;
|
||||
jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::Load::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::VectorLoad::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::ScalarLoad::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = dummy_functor;
|
||||
|
||||
jitters[ngraph::snippets::op::Store::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::VectorStore::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::ScalarStore::get_type_info_static()] = dummy_functor;
|
||||
|
||||
jitters[ngraph::snippets::op::Scalar::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::Tile::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = dummy_functor;
|
||||
}
|
||||
|
||||
std::shared_ptr<ngraph::snippets::op::Subgraph> LoweringTests::getSubgraph(const std::shared_ptr<Model>& f) {
|
||||
|
@ -49,6 +49,7 @@ TEST_P(CanonicalizationTests, Add) {
|
||||
function = snippets_function->getOriginal();
|
||||
function_ref = snippets_function->getReference();
|
||||
auto subgraph = getTokenizedSubgraph(function);
|
||||
subgraph->set_generator(std::make_shared<DummyGenerator>());
|
||||
Shape canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes);
|
||||
ASSERT_DIMS_EQ(canonical_output_shape, expected_output_shape);
|
||||
}
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <pass/collapse_subgraph.hpp>
|
||||
#include <subgraph_simple.hpp>
|
||||
#include <subgraph_converts.hpp>
|
||||
#include "snippets/pass/collapse_subgraph.hpp"
|
||||
|
||||
namespace ov {
|
||||
@ -39,6 +40,43 @@ TEST_F(CollapseSubgraphTests, smoke_Snippets_AvoidLoopEltwise) {
|
||||
run();
|
||||
}
|
||||
|
||||
TEST_F(CollapseSubgraphTests, smoke_Snippets_OneConvert) {
|
||||
const auto &f = ConvertFunction(std::vector<Shape>{{2, 5}});
|
||||
function = f.getOriginal();
|
||||
function_ref = f.getReference();
|
||||
run();
|
||||
}
|
||||
|
||||
TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertInput) {
|
||||
const auto &f = ConvertInputFunction(std::vector<Shape>{{2, 5}, {1, 5}});
|
||||
function = f.getOriginal();
|
||||
function_ref = f.getReference();
|
||||
run();
|
||||
}
|
||||
|
||||
TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertOutput) {
|
||||
const auto &f = ConvertOutputFunction(std::vector<Shape>{{2, 5}, {1, 5}});
|
||||
function = f.getOriginal();
|
||||
function_ref = f.getReference();
|
||||
run();
|
||||
}
|
||||
|
||||
TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertStub) {
|
||||
const auto &f = ConvertStubFunction(std::vector<Shape>{{2, 5, 2}, {1, 5, 1}});
|
||||
function = f.getOriginal();
|
||||
function_ref = f.getReference();
|
||||
run();
|
||||
}
|
||||
|
||||
TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertPartialInputsAndResults) {
|
||||
const auto &f = ConvertPartialInputsAndResultsFunction(std::vector<Shape>{{2, 5, 1}, {1, 5, 1}, {2, 1, 10}},
|
||||
std::vector<ov::element::Type>{ov::element::i8, ov::element::bf16, ov::element::f32},
|
||||
std::vector<ov::element::Type>{ov::element::f32, ov::element::i8});
|
||||
function = f.getOriginal();
|
||||
function_ref = f.getReference();
|
||||
run();
|
||||
}
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -19,56 +19,81 @@ using namespace ngraph;
|
||||
|
||||
// todo: Rewrite this test using Snippets test infrastructure. See ./include/canonicalization.hpp for example
|
||||
|
||||
TEST(TransformationTests, ReplaceLoadsWithScalarLoads) {
|
||||
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
|
||||
{
|
||||
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
|
||||
auto load = std::make_shared<snippets::isa::Load>(data);
|
||||
auto neg = std::make_shared<opset1::Negative>(load);
|
||||
auto store = std::make_shared<snippets::isa::Store>(neg);
|
||||
f = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
|
||||
|
||||
pass::Manager m;
|
||||
m.register_pass<pass::InitNodeInfo>();
|
||||
m.register_pass<snippets::pass::ReplaceLoadsWithScalarLoads>();
|
||||
m.run_passes(f);
|
||||
ASSERT_NO_THROW(check_rt_info(f));
|
||||
template<typename T>
|
||||
size_t get_count(const std::shared_ptr<Function>& f, const std::string& name) {
|
||||
size_t load_count = std::numeric_limits<size_t>::max();
|
||||
for (auto op : f->get_ops()) {
|
||||
if (op->get_friendly_name() == name) {
|
||||
load_count = ov::as_type_ptr<T>(op)->get_count();
|
||||
}
|
||||
}
|
||||
{
|
||||
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
|
||||
auto load = std::make_shared<snippets::isa::ScalarLoad>(data);
|
||||
auto neg = std::make_shared<opset1::Negative>(load);
|
||||
auto store = std::make_shared<snippets::isa::Store>(neg);
|
||||
f_ref = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
|
||||
}
|
||||
|
||||
auto res = compare_functions(f, f_ref);
|
||||
ASSERT_TRUE(res.first) << res.second;
|
||||
return load_count;
|
||||
}
|
||||
|
||||
TEST(TransformationTests, ReplaceStoresWithScalarStores) {
|
||||
TEST(TransformationTests, SetScalarCountForLoad) {
|
||||
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
|
||||
const auto count = 16;
|
||||
{
|
||||
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
|
||||
auto load = std::make_shared<snippets::isa::Load>(data);
|
||||
auto load = std::make_shared<snippets::isa::Load>(data, count);
|
||||
load->set_friendly_name("load");
|
||||
auto neg = std::make_shared<opset1::Negative>(load);
|
||||
auto store = std::make_shared<snippets::isa::Store>(neg);
|
||||
auto store = std::make_shared<snippets::isa::Store>(neg, count);
|
||||
f = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
|
||||
|
||||
pass::Manager m;
|
||||
m.register_pass<pass::InitNodeInfo>();
|
||||
m.register_pass<snippets::pass::ReplaceStoresWithScalarStores>();
|
||||
m.register_pass<snippets::pass::SetScalarCountForLoad>();
|
||||
m.run_passes(f);
|
||||
ASSERT_NO_THROW(check_rt_info(f));
|
||||
}
|
||||
{
|
||||
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
|
||||
auto load = std::make_shared<snippets::isa::Load>(data);
|
||||
auto load = std::make_shared<snippets::isa::Load>(data, 1lu);
|
||||
load->set_friendly_name("load_ref");
|
||||
auto neg = std::make_shared<opset1::Negative>(load);
|
||||
auto store = std::make_shared<snippets::isa::ScalarStore>(neg);
|
||||
auto store = std::make_shared<snippets::isa::Store>(neg, count);
|
||||
f_ref = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
|
||||
}
|
||||
|
||||
auto res = compare_functions(f, f_ref);
|
||||
ASSERT_TRUE(res.first) << res.second;
|
||||
|
||||
auto load_count = get_count<ngraph::snippets::op::Load>(f, "load");
|
||||
auto load_count_ref = get_count<ngraph::snippets::op::Load>(f_ref, "load_ref");
|
||||
ASSERT_EQ(load_count, load_count_ref);
|
||||
}
|
||||
|
||||
TEST(TransformationTests, SetScalarCountForStore) {
|
||||
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
|
||||
const auto count = 16;
|
||||
{
|
||||
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
|
||||
auto load = std::make_shared<snippets::isa::Load>(data, count);
|
||||
auto neg = std::make_shared<opset1::Negative>(load);
|
||||
auto store = std::make_shared<snippets::isa::Store>(neg, count);
|
||||
store->set_friendly_name("store");
|
||||
f = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
|
||||
|
||||
pass::Manager m;
|
||||
m.register_pass<pass::InitNodeInfo>();
|
||||
m.register_pass<snippets::pass::SetScalarCountForStore>();
|
||||
m.run_passes(f);
|
||||
ASSERT_NO_THROW(check_rt_info(f));
|
||||
}
|
||||
{
|
||||
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
|
||||
auto load = std::make_shared<snippets::isa::Load>(data, count);
|
||||
auto neg = std::make_shared<opset1::Negative>(load);
|
||||
auto store = std::make_shared<snippets::isa::Store>(neg, 1lu);
|
||||
store->set_friendly_name("store_ref");
|
||||
f_ref = std::make_shared<Function>(NodeVector{store}, ParameterVector{data});
|
||||
}
|
||||
|
||||
auto res = compare_functions(f, f_ref);
|
||||
ASSERT_TRUE(res.first) << res.second;
|
||||
|
||||
int64_t store_count = get_count<ngraph::snippets::op::Store>(f, "store");
|
||||
int64_t store_count_ref = get_count<ngraph::snippets::op::Store>(f_ref, "store_ref");
|
||||
ASSERT_EQ(store_count, store_count_ref);
|
||||
}
|
@ -25,12 +25,14 @@ TEST(TransformationTests, AssignRegisters) {
|
||||
{
|
||||
auto p0 = std::make_shared<opset1::Parameter>(element::f32, Shape(1));
|
||||
auto p1 = std::make_shared<opset1::Parameter>(element::f32, Shape(1));
|
||||
p0->set_friendly_name("p00");
|
||||
p1->set_friendly_name("p01");
|
||||
auto y00 = std::make_shared<snippets::isa::Load>(p0); y00->set_friendly_name("y00");
|
||||
auto y01 = std::make_shared<snippets::isa::Load>(p1); y01->set_friendly_name("y01");
|
||||
auto y02 = std::make_shared<opset1::Multiply>(y00, y01); y02->set_friendly_name("y02");
|
||||
auto y03 = std::make_shared<snippets::isa::Store>(y02); y03->set_friendly_name("y03");
|
||||
|
||||
f = std::make_shared<Function>(NodeVector{y03}, ParameterVector{p0, p1});
|
||||
auto s00 = std::make_shared<snippets::isa::Store>(y02); s00->set_friendly_name("y03");
|
||||
s00->set_friendly_name("s00");
|
||||
f = std::make_shared<Function>(NodeVector{s00}, ParameterVector{p0, p1});
|
||||
|
||||
pass::Manager m;
|
||||
m.register_pass<pass::InitNodeInfo>();
|
||||
@ -39,13 +41,17 @@ TEST(TransformationTests, AssignRegisters) {
|
||||
ASSERT_NO_THROW(check_rt_info(f));
|
||||
}
|
||||
|
||||
// instead of comparing to a reference function check that registers are correctly assigned
|
||||
// and stored to runtime info
|
||||
/* Instead of comparing to a reference function check that registers are correctly assigned and stored to runtime
|
||||
* info. Note that Parameters and Store rt_info contains gpr indexes, while general op's rt_info contain vector
|
||||
* indexes */
|
||||
{
|
||||
std::map<std::string, size_t> ref_registers {
|
||||
{"p00", 0}, // gpr
|
||||
{"p01", 1}, // gpr
|
||||
{"y00", 0},
|
||||
{"y01", 1},
|
||||
{"y02", 2}
|
||||
{"y02", 2},
|
||||
{"s00", 2}, // gpr
|
||||
};
|
||||
|
||||
auto total_ops = 0;
|
||||
@ -75,6 +81,14 @@ TEST(TransformationTests, AssignRegisters2) {
|
||||
auto p5 = std::make_shared<opset1::Parameter>(ngraph::element::f32, Shape());
|
||||
auto p6 = std::make_shared<opset1::Parameter>(ngraph::element::f32, Shape());
|
||||
auto p7 = std::make_shared<opset1::Parameter>(ngraph::element::f32, Shape());
|
||||
p0->set_friendly_name("p00");
|
||||
p1->set_friendly_name("p01");
|
||||
p2->set_friendly_name("p02");
|
||||
p3->set_friendly_name("p03");
|
||||
p4->set_friendly_name("p04");
|
||||
p5->set_friendly_name("p05");
|
||||
p6->set_friendly_name("p06");
|
||||
p7->set_friendly_name("p07");
|
||||
|
||||
auto c0 = std::make_shared<snippets::isa::Scalar>(ngraph::element::f32, Shape(), 3.14f); c0->set_friendly_name("r00");
|
||||
auto c1 = std::make_shared<snippets::isa::Scalar>(ngraph::element::f32, Shape(), 6.6260701e-34f); c1->set_friendly_name("r01");
|
||||
@ -102,9 +116,10 @@ TEST(TransformationTests, AssignRegisters2) {
|
||||
auto y20 = std::make_shared<opset1::Add>(y17, y18); y20->set_friendly_name("r22");
|
||||
auto y21 = std::make_shared<opset1::Add>(y15, y19); y21->set_friendly_name("r23");
|
||||
auto y22 = std::make_shared<opset1::Add>(y20, y21); y22->set_friendly_name("r24");
|
||||
auto y23 = std::make_shared<snippets::isa::Store>(y22);
|
||||
auto s00 = std::make_shared<snippets::isa::Store>(y22);
|
||||
s00->set_friendly_name("s00");
|
||||
|
||||
f = std::make_shared<Function>(NodeVector{y23}, ParameterVector{p0, p1, p2, p3, p4, p5, p6, p7});
|
||||
f = std::make_shared<Function>(NodeVector{s00}, ParameterVector{p0, p1, p2, p3, p4, p5, p6, p7});
|
||||
|
||||
pass::Manager m;
|
||||
m.register_pass<pass::InitNodeInfo>();
|
||||
@ -117,10 +132,14 @@ TEST(TransformationTests, AssignRegisters2) {
|
||||
// and stored to runtime info
|
||||
{
|
||||
std::map<std::string, size_t> ref_registers {
|
||||
{"r00", 1}, {"r01", 3}, {"r02", 5}, {"r03", 5}, {"r04", 2}, {"r05", 6}, {"r06", 6}, {"r07", 6},
|
||||
{"r08", 5}, {"r09", 2}, {"r10", 1}, {"r11", 4}, {"r12", 4}, {"r13", 6}, {"r14", 2}, {"r15", 5},
|
||||
{"r16", 0}, {"r17", 4}, {"r18", 0}, {"r19", 2}, {"r20", 4}, {"r21", 1}, {"r22", 0}, {"r23", 6},
|
||||
{"r24", 1}
|
||||
{"p00", 0}, {"p01", 1}, {"p02", 2}, {"p03", 3}, {"p04", 4}, {"p05", 5},
|
||||
{"p06", 6}, {"p07", 7},
|
||||
{"r00", 1}, {"r01", 3}, {"r02", 5}, {"r03", 5}, {"r04", 2}, {"r05", 6},
|
||||
{"r06", 6}, {"r07", 6}, {"r08", 5}, {"r09", 2}, {"r10", 1}, {"r11", 4},
|
||||
{"r12", 4}, {"r13", 6}, {"r14", 2}, {"r15", 5}, {"r16", 0}, {"r17", 4},
|
||||
{"r18", 0}, {"r19", 2}, {"r20", 4}, {"r21", 1}, {"r22", 0}, {"r23", 6},
|
||||
{"r24", 1},
|
||||
{"s00", 8},
|
||||
};
|
||||
|
||||
auto total_ops = 0;
|
||||
|
@ -13,6 +13,10 @@
|
||||
#include "jit_eltwise_emitters.hpp"
|
||||
#include "jit_dnnl_emitters.hpp"
|
||||
#include "jit_dnnl_ext_emitters.hpp"
|
||||
#include "jit_conversion_emitters.hpp"
|
||||
|
||||
#include "snippets_transformations/op/load_convert.hpp"
|
||||
#include "snippets_transformations/op/store_convert.hpp"
|
||||
|
||||
#include <ngraph/opsets/opset5.hpp>
|
||||
|
||||
@ -39,25 +43,25 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_
|
||||
: TargetMachine(), h(new jit_snippet()), isa(host_isa) {
|
||||
// data movement
|
||||
jitters[ngraph::opset1::Parameter::get_type_info_static()] = CREATE_EMITTER(NopEmitter);
|
||||
jitters[ngraph::snippets::op::BlockedParameter::get_type_info_static()] = CREATE_EMITTER(NopEmitter);
|
||||
jitters[ngraph::opset1::Result::get_type_info_static()] = CREATE_EMITTER(NopEmitter);
|
||||
// jitters[ngraph::opset1::Constant::get_type_info_static()] = CREATE_EMITTER(); // Not supported
|
||||
|
||||
jitters[ngraph::snippets::op::Load::get_type_info_static()] = CREATE_EMITTER(LoadEmitter);
|
||||
jitters[ngraph::snippets::op::VectorLoad::get_type_info_static()] = CREATE_EMITTER(LoadEmitter);
|
||||
jitters[ngraph::snippets::op::ScalarLoad::get_type_info_static()] = CREATE_EMITTER(ScalarLoadEmitter);
|
||||
jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = CREATE_EMITTER(BroadcastLoadEmitter);
|
||||
jitters[ov::intel_cpu::LoadConvertSaturation::get_type_info_static()] = CREATE_EMITTER(LoadConvertEmitter);
|
||||
jitters[ov::intel_cpu::LoadConvertTruncation::get_type_info_static()] = CREATE_EMITTER(LoadConvertEmitter);
|
||||
|
||||
jitters[ngraph::snippets::op::Store::get_type_info_static()] = CREATE_EMITTER(StoreEmitter);
|
||||
jitters[ngraph::snippets::op::VectorStore::get_type_info_static()] = CREATE_EMITTER(StoreEmitter);
|
||||
jitters[ngraph::snippets::op::ScalarStore::get_type_info_static()] = CREATE_EMITTER(ScalarStoreEmitter);
|
||||
jitters[ov::intel_cpu::StoreConvertSaturation::get_type_info_static()] = CREATE_EMITTER(StoreConvertEmitter);
|
||||
jitters[ov::intel_cpu::StoreConvertTruncation::get_type_info_static()] = CREATE_EMITTER(StoreConvertEmitter);
|
||||
|
||||
jitters[ngraph::snippets::op::Scalar::get_type_info_static()] = CREATE_EMITTER(ScalarEmitter);
|
||||
jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = CREATE_EMITTER(FakeBroadcastEmitter);
|
||||
jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = CREATE_EMITTER(BroadcastMoveEmitter);
|
||||
// jitters[ngraph::snippets::op::Nop::get_type_info_static()] = CREATE_EMITTER(NopEmitter); // Not supported
|
||||
// jitters[ngraph::opset1::Broadcast::get_type_info_static()] = CREATE_EMITTER(); // Not supported
|
||||
|
||||
// jitters[ngraph::opset1::Convert::get_type_info_static()] = CREATE_EMITTER(); // Not supported
|
||||
jitters[ngraph::snippets::op::ConvertTruncation::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_convert_truncation_emitter);
|
||||
jitters[ngraph::snippets::op::ConvertSaturation::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_convert_saturation_emitter);
|
||||
// jitters[ngraph::opset1::FakeQuantize::get_type_info_static()] = CREATE_EMITTER(); // not supported
|
||||
|
||||
// binary
|
||||
@ -118,6 +122,7 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_
|
||||
|
||||
jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = CREATE_EMITTER(KernelEmitter);
|
||||
jitters[ngraph::snippets::op::Tile::get_type_info_static()] = CREATE_EMITTER(TileEmitter);
|
||||
jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = CREATE_EMITTER(TileSchedulerEmitter);
|
||||
}
|
||||
|
||||
size_t ov::intel_cpu::CPUTargetMachine::get_lanes() const {
|
||||
|
313
src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.cpp
Normal file
313
src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.cpp
Normal file
@ -0,0 +1,313 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "jit_conversion_emitters.hpp"
|
||||
#include "utils/bfloat16.hpp"
|
||||
#include <cpu/x64/jit_uni_eltwise.hpp>
|
||||
#include <ngraph/opsets/opset1.hpp>
|
||||
#include <nodes/eltwise.h>
|
||||
|
||||
using namespace InferenceEngine;
|
||||
using namespace dnnl::impl::utils;
|
||||
using namespace dnnl::impl;
|
||||
using namespace dnnl::impl::cpu::x64;
|
||||
using namespace Xbyak;
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
jit_convert_emitter::jit_convert_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr<ngraph::Node>& node, Precision exec_prc)
|
||||
: jit_emitter(host, host_isa, node, exec_prc) {
|
||||
input_type = node->get_input_element_type(0);
|
||||
output_type = node->get_output_element_type(0);
|
||||
|
||||
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
|
||||
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(host, host_isa));
|
||||
}
|
||||
|
||||
void jit_convert_emitter::validate_types() const {
|
||||
auto is_supported_type = [this](const ov::element::Type& type) {
|
||||
return any_of(supported_types.begin(), supported_types.end(),
|
||||
[&type](const ov::element::Type& supported_type) { return supported_type == type; } );
|
||||
};
|
||||
|
||||
if (!is_supported_type(input_type))
|
||||
IE_THROW() << "Unsupported input type: " << input_type.get_type_name();
|
||||
if (!is_supported_type(output_type))
|
||||
IE_THROW() << "Unsupported output type: " << output_type.get_type_name();
|
||||
}
|
||||
|
||||
size_t jit_convert_emitter::get_inputs_num() const { return 1; }
|
||||
|
||||
void jit_convert_emitter::emit_data() const {
|
||||
jit_emitter::emit_data();
|
||||
if (emu_vcvtneps2bf16)
|
||||
emu_vcvtneps2bf16->emit_data();
|
||||
}
|
||||
|
||||
void jit_convert_emitter::float2bfloat(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
|
||||
Zmm zmm_src = Zmm(in_vec_idxs[0]);
|
||||
Zmm zmm_dst = Zmm(out_vec_idxs[0]);
|
||||
|
||||
if (mayiuse(avx512_core_bf16)) {
|
||||
h->vcvtneps2bf16(zmm_dst, zmm_src);
|
||||
} else {
|
||||
if (!emu_vcvtneps2bf16)
|
||||
IE_THROW() << "Converter from float to bf16 isn't initialized!";
|
||||
|
||||
emu_vcvtneps2bf16->emit_code({static_cast<size_t>(zmm_src.getIdx())}, {static_cast<size_t>(zmm_dst.getIdx())});
|
||||
}
|
||||
}
|
||||
|
||||
jit_convert_truncation_emitter::jit_convert_truncation_emitter(jit_generator *host, cpu_isa_t host_isa,
|
||||
const std::shared_ptr<ngraph::Node>& node, Precision exec_prc)
|
||||
: jit_convert_emitter(host, host_isa, node, exec_prc) {
|
||||
prepare_table();
|
||||
}
|
||||
|
||||
bool jit_convert_truncation_emitter::is_i8_and_u8_case() const {
|
||||
return one_of(input_type, ov::element::i8, ov::element::u8) &&
|
||||
one_of(output_type, ov::element::i8, ov::element::u8);
|
||||
}
|
||||
|
||||
void jit_convert_truncation_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
|
||||
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs,
|
||||
const emitter_context *emit_context) const {
|
||||
validate_types();
|
||||
if (host_isa_ == cpu::x64::sse41) {
|
||||
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
|
||||
} else if (host_isa_ == cpu::x64::avx2) {
|
||||
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
|
||||
} else if (host_isa_ == cpu::x64::avx512_core) {
|
||||
emit_isa<cpu::x64::avx512_core>(in_vec_idxs, out_vec_idxs);
|
||||
} else {
|
||||
assert(!"unsupported isa");
|
||||
}
|
||||
}
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void jit_convert_truncation_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
|
||||
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
Vmm vmm_src = Vmm(in_vec_idxs[0]);
|
||||
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
|
||||
|
||||
// For Truncation behavior we can just move data from src to dst if we want convert i8 -> u8 or u8 -> i8
|
||||
if ((input_type == output_type) || is_i8_and_u8_case()) {
|
||||
if (vmm_src != vmm_dst) {
|
||||
h->uni_vmovups(vmm_dst, vmm_src);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
switch (input_type) {
|
||||
case ov::element::f32:
|
||||
if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8))
|
||||
h->uni_vcvttps2dq(vmm_dst, vmm_src);
|
||||
break;
|
||||
case ov::element::i32:
|
||||
if (one_of(output_type, ov::element::f32, ov::element::bf16))
|
||||
h->uni_vcvtdq2ps(vmm_dst, vmm_src);
|
||||
break;
|
||||
case ov::element::bf16:
|
||||
h->vpmovzxwd(vmm_dst, vmm_src);
|
||||
h->uni_vpslld(vmm_dst, vmm_dst, 16);
|
||||
if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8))
|
||||
h->uni_vcvttps2dq(vmm_dst, vmm_dst);
|
||||
break;
|
||||
case ov::element::i8:
|
||||
h->uni_vpmovsxbd(vmm_dst, vmm_src);
|
||||
break;
|
||||
case ov::element::u8:
|
||||
h->uni_vpmovzxbd(vmm_dst, vmm_src);
|
||||
break;
|
||||
default:
|
||||
assert(!"unsupported output data type");
|
||||
}
|
||||
|
||||
switch (output_type) {
|
||||
case ov::element::f32:
|
||||
if (!one_of(input_type, ov::element::i32, ov::element::bf16)) {
|
||||
h->uni_vcvtdq2ps(vmm_dst, vmm_dst);
|
||||
}
|
||||
break;
|
||||
case ov::element::i32:
|
||||
break;
|
||||
case ov::element::bf16:
|
||||
if (input_type == ov::element::f32) {
|
||||
float2bfloat({static_cast<size_t>(vmm_src.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
|
||||
} else {
|
||||
if (one_of(input_type, ov::element::i8, ov::element::u8)) {
|
||||
h->uni_vcvtdq2ps(vmm_dst, vmm_dst);
|
||||
}
|
||||
float2bfloat({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
|
||||
}
|
||||
break;
|
||||
case ov::element::i8:
|
||||
case ov::element::u8:
|
||||
if (input_type == ov::element::i32) {
|
||||
dword2int8<isa>({static_cast<size_t>(vmm_src.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
|
||||
} else {
|
||||
dword2int8<isa>({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
|
||||
}
|
||||
break;
|
||||
default:
|
||||
assert(!"unsupported output data type");
|
||||
}
|
||||
}
|
||||
|
||||
void jit_convert_truncation_emitter::register_table_entries() {
|
||||
if (host_isa_ == dnnl::impl::cpu::x64::avx2 &&
|
||||
one_of(output_type, ov::element::i8, ov::element::u8) &&
|
||||
!is_i8_and_u8_case())
|
||||
push_arg_entry_of("mask_byte", 0x000000ff, true);
|
||||
}
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void jit_convert_truncation_emitter::dword2int8(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
|
||||
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
Vmm vmm_src = Vmm(in_vec_idxs[0]);
|
||||
|
||||
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
|
||||
Xmm xmm_dst = Xmm(out_vec_idxs[0]);
|
||||
Ymm ymm_dst = Ymm(out_vec_idxs[0]);
|
||||
|
||||
if (isa == dnnl::impl::cpu::x64::avx512_core) {
|
||||
h->vpmovdb(xmm_dst, vmm_src);
|
||||
} else if (isa == dnnl::impl::cpu::x64::avx2) {
|
||||
h->vpand(vmm_dst, vmm_src, table_val("mask_byte")); // to avoid saturation
|
||||
h->uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
|
||||
if (isa != dnnl::impl::cpu::x64::sse41)
|
||||
h->vpermq(ymm_dst, ymm_dst, 0x08);
|
||||
h->uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
|
||||
}
|
||||
}
|
||||
|
||||
jit_convert_saturation_emitter::jit_convert_saturation_emitter(jit_generator *host, cpu_isa_t host_isa,
|
||||
const std::shared_ptr<ngraph::Node>& node, Precision exec_prc)
|
||||
: jit_convert_emitter(host, host_isa, node, exec_prc) {
|
||||
}
|
||||
|
||||
void jit_convert_saturation_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
|
||||
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs,
|
||||
const emitter_context *emit_context) const {
|
||||
validate_types();
|
||||
if (host_isa_ == cpu::x64::sse41) {
|
||||
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
|
||||
} else if (host_isa_ == cpu::x64::avx2) {
|
||||
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
|
||||
} else if (host_isa_ == cpu::x64::avx512_core) {
|
||||
emit_isa<cpu::x64::avx512_core>(in_vec_idxs, out_vec_idxs);
|
||||
} else {
|
||||
assert(!"unsupported isa");
|
||||
}
|
||||
}
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void jit_convert_saturation_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
|
||||
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
Vmm vmm_src = Vmm(in_vec_idxs[0]);
|
||||
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
|
||||
|
||||
if (input_type == output_type) {
|
||||
h->uni_vmovups(vmm_dst, vmm_src);
|
||||
return;
|
||||
}
|
||||
|
||||
switch (input_type) {
|
||||
case ov::element::f32:
|
||||
if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8))
|
||||
h->uni_vcvtps2dq(vmm_dst, vmm_src);
|
||||
break;
|
||||
case ov::element::i32:
|
||||
if (one_of(output_type, ov::element::f32, ov::element::bf16))
|
||||
h->uni_vcvtdq2ps(vmm_dst, vmm_src);
|
||||
break;
|
||||
case ov::element::bf16:
|
||||
h->vpmovzxwd(vmm_dst, vmm_src);
|
||||
h->uni_vpslld(vmm_dst, vmm_dst, 16);
|
||||
if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8))
|
||||
h->uni_vcvttps2dq(vmm_dst, vmm_dst);
|
||||
break;
|
||||
case ov::element::i8:
|
||||
h->uni_vpmovsxbd(vmm_dst, vmm_src);
|
||||
break;
|
||||
case ov::element::u8:
|
||||
h->uni_vpmovzxbd(vmm_dst, vmm_src);
|
||||
break;
|
||||
default:
|
||||
assert(!"unsupported output data type");
|
||||
}
|
||||
|
||||
switch (output_type) {
|
||||
case ov::element::f32:
|
||||
if (!one_of(input_type, ov::element::i32, ov::element::bf16)) {
|
||||
h->uni_vcvtdq2ps(vmm_dst, vmm_dst);
|
||||
}
|
||||
break;
|
||||
case ov::element::i32:
|
||||
break;
|
||||
case ov::element::bf16:
|
||||
if (input_type == ov::element::f32) {
|
||||
float2bfloat({static_cast<size_t>(vmm_src.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
|
||||
} else {
|
||||
if (one_of(input_type, ov::element::i8, ov::element::u8)) {
|
||||
h->uni_vcvtdq2ps(vmm_dst, vmm_dst);
|
||||
}
|
||||
float2bfloat({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())});
|
||||
}
|
||||
break;
|
||||
case ov::element::i8:
|
||||
case ov::element::u8:
|
||||
if (input_type == ov::element::i32) {
|
||||
dword2int8<isa>({static_cast<size_t>(vmm_src.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())}, output_type.is_signed());
|
||||
} else {
|
||||
dword2int8<isa>({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())}, output_type.is_signed());
|
||||
}
|
||||
break;
|
||||
default:
|
||||
assert(!"unsupported output data type");
|
||||
}
|
||||
}
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void jit_convert_saturation_emitter::dword2int8(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs, bool is_signed) const {
|
||||
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
Vmm vmm_src = Vmm(in_vec_idxs[0]);
|
||||
|
||||
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
|
||||
Xmm xmm_dst = Xmm(out_vec_idxs[0]);
|
||||
Ymm ymm_dst = Ymm(out_vec_idxs[0]);
|
||||
|
||||
if (isa == dnnl::impl::cpu::x64::avx512_core) {
|
||||
if (is_signed) {
|
||||
h->vpmovsdb(xmm_dst, vmm_src);
|
||||
} else {
|
||||
Vmm vmm_zero = Vmm(aux_vec_idxs[0]);
|
||||
h->vpxord(vmm_zero, vmm_zero, vmm_zero);
|
||||
h->vpmaxsd(vmm_dst, vmm_src, vmm_zero);
|
||||
h->vpmovusdb(xmm_dst, vmm_dst);
|
||||
}
|
||||
} else {
|
||||
if (is_signed)
|
||||
h->uni_vpackssdw(vmm_dst, vmm_src, vmm_src);
|
||||
else
|
||||
h->uni_vpackusdw(vmm_dst, vmm_src, vmm_src);
|
||||
|
||||
if (isa != dnnl::impl::cpu::x64::sse41)
|
||||
h->vpermq(ymm_dst, ymm_dst, 0x08);
|
||||
|
||||
if (is_signed)
|
||||
h->uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
|
||||
else
|
||||
h->uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
|
||||
}
|
||||
}
|
||||
|
||||
size_t jit_convert_saturation_emitter::aux_vecs_count() const {
|
||||
// 1 register is for dword2int8 unsigned
|
||||
return output_type == ov::element::u8 && host_isa_ == dnnl::impl::cpu::x64::avx512_core? 1 : 0;
|
||||
}
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
@ -0,0 +1,87 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cpu/x64/jit_generator.hpp>
|
||||
#include "jit_emitter.hpp"
|
||||
#include "jit_bf16_emitters.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
class jit_convert_emitter : public jit_emitter {
|
||||
public:
|
||||
jit_convert_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
|
||||
const std::shared_ptr<ngraph::Node>& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
|
||||
|
||||
size_t get_inputs_num() const override;
|
||||
|
||||
protected:
|
||||
void emit_data() const override;
|
||||
void validate_types() const;
|
||||
|
||||
void float2bfloat(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
|
||||
|
||||
ov::element::Type input_type;
|
||||
ov::element::Type output_type;
|
||||
|
||||
const ov::element::TypeVector supported_types = {
|
||||
ov::element::f32,
|
||||
ov::element::i32,
|
||||
ov::element::bf16,
|
||||
ov::element::i8,
|
||||
ov::element::u8
|
||||
};
|
||||
|
||||
std::shared_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16 = nullptr;
|
||||
};
|
||||
|
||||
// This emitter is covered by specification of "Convert" operation. The implementation uses a "warp-around" conversion.
|
||||
// Example:
|
||||
// int32_t -> int8_t
|
||||
// 129 -> -127
|
||||
class jit_convert_truncation_emitter : public jit_convert_emitter {
|
||||
public:
|
||||
jit_convert_truncation_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
|
||||
const std::shared_ptr<ngraph::Node>& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
|
||||
|
||||
private:
|
||||
void emit_impl(const std::vector<size_t>& in, const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool, const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override;
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void dword2int8(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
|
||||
|
||||
bool is_i8_and_u8_case() const;
|
||||
void register_table_entries() override;
|
||||
};
|
||||
|
||||
// This emitter is covered by the common dnnl behavior. The implementation uses a "saturation" conversion.
|
||||
// Example:
|
||||
// int32_t -> int8_t
|
||||
// 129 -> 127
|
||||
class jit_convert_saturation_emitter : public jit_convert_emitter {
|
||||
public:
|
||||
jit_convert_saturation_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
|
||||
const std::shared_ptr<ngraph::Node>& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
|
||||
|
||||
private:
|
||||
void emit_impl(const std::vector<size_t>& in, const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool, const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override;
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void dword2int8(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs, bool is_signed) const;
|
||||
|
||||
size_t aux_vecs_count() const override;
|
||||
};
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
@ -46,6 +46,10 @@ size_t jit_emitter::aux_vecs_count() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
emitter_in_out_map jit_emitter::get_in_out_type() const {
|
||||
return in_out_type_;
|
||||
}
|
||||
|
||||
size_t jit_emitter::aux_gprs_count() const {
|
||||
// We need one gpr to load table address
|
||||
return entry_map_.empty() ? 0 : 1;
|
||||
|
@ -55,6 +55,7 @@ public:
|
||||
const std::vector<size_t> &pool_vec_idxs = {}, const std::vector<size_t> &pool_gpr_idxs = {});
|
||||
virtual size_t get_inputs_num() const = 0;
|
||||
virtual size_t aux_vecs_count() const;
|
||||
emitter_in_out_map get_in_out_type() const;
|
||||
static std::set<InferenceEngine::Precision> get_supported_precisions();
|
||||
|
||||
protected:
|
||||
|
@ -547,8 +547,10 @@ void jit_load_emitter::register_table_entries() {
|
||||
|
||||
/// STORE ///
|
||||
jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
|
||||
Precision src_prc, Precision dst_prc, int store_num, Precision exec_prc, emitter_in_out_map in_out_type)
|
||||
: jit_emitter(host, host_isa, exec_prc, in_out_type), store_num_(store_num), src_prc_(src_prc), dst_prc_(dst_prc), name_("unknown") {
|
||||
Precision src_prc, Precision dst_prc, int store_num, arithmetic_mode mode, Precision exec_prc,
|
||||
emitter_in_out_map in_out_type)
|
||||
: jit_emitter(host, host_isa, exec_prc, in_out_type), store_num_(store_num), src_prc_(src_prc), dst_prc_(dst_prc), mode_(mode), name_("unknown") {
|
||||
prepare_table();
|
||||
v_len_elt_ = get_vec_length() / exec_prc.size();
|
||||
store_size_ = store_num * dst_prc.size();
|
||||
if (!mayiuse(cpu::x64::avx512_core_bf16) && mayiuse(cpu::x64::avx512_core)) {
|
||||
@ -556,9 +558,25 @@ jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host,
|
||||
}
|
||||
}
|
||||
|
||||
// 0 for temp reg for mask store for avx512
|
||||
inline bool jit_store_emitter::is_saturation() const {
|
||||
return mode_ == arithmetic_mode::saturation;
|
||||
}
|
||||
|
||||
// case for SSE and AVX2 when we should use AND to truncate values
|
||||
inline bool jit_store_emitter::is_truncation_emulation() const {
|
||||
return !mayiuse(cpu::x64::avx512_core) && !is_saturation() &&
|
||||
src_prc_ != dst_prc_ && one_of(dst_prc_, Precision::U16, Precision::I16, Precision::U8, Precision::I8);
|
||||
}
|
||||
|
||||
size_t jit_store_emitter::aux_gprs_count() const {
|
||||
return get_aux_regs_for_avx512_mask(store_num_ * src_prc_.size());
|
||||
// for temp reg for mask store
|
||||
int count = get_aux_regs_for_avx512_mask(store_num_ * src_prc_.size());
|
||||
|
||||
// for table value in truncation arithmetic mode
|
||||
if (is_truncation_emulation())
|
||||
count++;
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
size_t jit_store_emitter::aux_vecs_count() const {
|
||||
@ -580,6 +598,7 @@ size_t jit_store_emitter::aux_vecs_count() const {
|
||||
size_t jit_store_emitter::get_inputs_num() const { return 1; }
|
||||
|
||||
void jit_store_emitter::emit_data() const {
|
||||
jit_emitter::emit_data();
|
||||
if (emu_vcvtneps2bf16_)
|
||||
emu_vcvtneps2bf16_->emit_data();
|
||||
}
|
||||
@ -618,7 +637,11 @@ void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64 ®_d
|
||||
switch (src_prc_) {
|
||||
case Precision::FP32:
|
||||
if ((dst_prc_ != Precision::FP32) && (dst_prc_ != Precision::BF16)) {
|
||||
h->uni_vcvtps2dq(Vmm(aux_vec_idxs.back()), Vmm(data_idx));
|
||||
if (is_saturation()) {
|
||||
h->uni_vcvtps2dq(Vmm(aux_vec_idxs.back()), Vmm(data_idx));
|
||||
} else {
|
||||
h->uni_vcvttps2dq(Vmm(aux_vec_idxs.back()), Vmm(data_idx));
|
||||
}
|
||||
data_idx = aux_vec_idxs.back();
|
||||
}
|
||||
break;
|
||||
@ -804,7 +827,7 @@ void jit_store_emitter::store_bytes(const Vmm &vmm, const Xbyak::Reg64 ®, int
|
||||
|
||||
/**
|
||||
* store_dword_to_byte_extension is the utility function to
|
||||
* 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num bytes with singed or unsinged saturation.
|
||||
* 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num bytes with and without singed or unsinged saturation.
|
||||
* 2. store the packed byte into the memory referenced by ptr[reg + offset] address.
|
||||
*/
|
||||
template <typename Vmm>
|
||||
@ -835,28 +858,37 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya
|
||||
};
|
||||
|
||||
auto store_dword_to_byte_base = [&]() {
|
||||
// db only available on avx512, need dw+wb to emulate
|
||||
if (is_signed)
|
||||
h->uni_vpackssdw(vmm, vmm, vmm);
|
||||
else
|
||||
h->uni_vpackusdw(vmm, vmm, vmm);
|
||||
// gather 2(cross lane) 64 bits into lower vmm to store
|
||||
// [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0]
|
||||
if (is_ymm) {
|
||||
h->vpermq(ymm, ymm, 0x08); // 00001000
|
||||
}
|
||||
if (is_saturation()) {
|
||||
// db only available on avx512, need dw+wb to emulate
|
||||
if (is_signed)
|
||||
h->uni_vpackssdw(vmm, vmm, vmm);
|
||||
else
|
||||
h->uni_vpackusdw(vmm, vmm, vmm);
|
||||
// gather 2(cross lane) 64 bits into lower vmm to store
|
||||
// [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0]
|
||||
if (is_ymm) {
|
||||
h->vpermq(ymm, ymm, 0x08); // 00001000
|
||||
}
|
||||
|
||||
if (is_signed)
|
||||
h->uni_vpacksswb(vmm, vmm, vmm);
|
||||
else
|
||||
if (is_signed)
|
||||
h->uni_vpacksswb(vmm, vmm, vmm);
|
||||
else
|
||||
h->uni_vpackuswb(vmm, vmm, vmm);
|
||||
} else {
|
||||
h->vpand(vmm, vmm, table_val("mask_truncation_byte")); // to avoid saturation
|
||||
h->uni_vpackssdw(vmm, vmm, vmm);
|
||||
if (is_ymm)
|
||||
h->vpermq(ymm, ymm, 0x08);
|
||||
h->uni_vpackuswb(vmm, vmm, vmm);
|
||||
}
|
||||
|
||||
store_bytes(vmm, reg, offset, store_num);
|
||||
};
|
||||
|
||||
switch (store_num) {
|
||||
case 16:
|
||||
// must support avx512F
|
||||
case 16:
|
||||
// must support avx512F
|
||||
if (is_saturation()) {
|
||||
if (is_signed) {
|
||||
h->vpmovsdb(addr(0), vmm);
|
||||
} else {
|
||||
@ -865,9 +897,13 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya
|
||||
h->uni_vpmaxsd(vmm, vmm, zero);
|
||||
h->vpmovusdb(addr(0), vmm);
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
if (mayiuse(cpu::x64::avx512_core)) { // ymm block on avx512F + VL
|
||||
} else {
|
||||
h->vpmovdb(addr(0), vmm);
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
if (mayiuse(cpu::x64::avx512_core)) {
|
||||
if (is_saturation()) { // ymm block on avx512F + VL
|
||||
if (is_signed) {
|
||||
h->vpmovsdb(addr(0), ymm);
|
||||
} else {
|
||||
@ -877,11 +913,15 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya
|
||||
h->vpmovusdb(addr(0), ymm);
|
||||
}
|
||||
} else {
|
||||
store_dword_to_byte_base();
|
||||
h->vpmovdb(addr(0), ymm);
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
if (mayiuse(cpu::x64::avx512_core)) { // xmm block on avx512F + VL
|
||||
} else {
|
||||
store_dword_to_byte_base();
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
if (mayiuse(cpu::x64::avx512_core)) {
|
||||
if (is_saturation()) {// xmm block on avx512F + VL
|
||||
if (is_signed) {
|
||||
h->vpmovsdb(addr(0), xmm);
|
||||
} else {
|
||||
@ -891,15 +931,19 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya
|
||||
h->vpmovusdb(addr(0), xmm);
|
||||
}
|
||||
} else {
|
||||
store_dword_to_byte_base();
|
||||
h->vpmovdb(addr(0), xmm);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
if (is_zmm) { // avx512F
|
||||
unsigned int mask = 1;
|
||||
mask = (mask << store_num) - mask;
|
||||
h->mov(Reg32(aux_gpr_idxs[0]), mask);
|
||||
h->kmovw(k_mask, Reg32(aux_gpr_idxs[0]));
|
||||
} else {
|
||||
store_dword_to_byte_base();
|
||||
}
|
||||
break;
|
||||
default:
|
||||
if (is_zmm) { // avx512F
|
||||
unsigned int mask = 1;
|
||||
mask = (mask << store_num) - mask;
|
||||
h->mov(Reg32(aux_gpr_idxs[0]), mask);
|
||||
h->kmovw(k_mask, Reg32(aux_gpr_idxs[0]));
|
||||
if (is_saturation()) {
|
||||
if (is_signed) {
|
||||
h->vpmovsdb(addr(0), vmm | k_mask);
|
||||
} else {
|
||||
@ -909,9 +953,12 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya
|
||||
h->vpmovusdb(addr(0), vmm | k_mask);
|
||||
}
|
||||
} else {
|
||||
store_dword_to_byte_base();
|
||||
h->vpmovdb(addr(0), vmm | k_mask);
|
||||
}
|
||||
break;
|
||||
} else {
|
||||
store_dword_to_byte_base();
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@ -946,16 +993,21 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
|
||||
auto zmm = Xbyak::Zmm(vmm.getIdx());
|
||||
|
||||
auto store_dword_to_word_base = [&]() {
|
||||
// direct mov_dw available only on avx512, emulate with pack_dw + permute + pure store
|
||||
if (is_signed)
|
||||
h->uni_vpackssdw(vmm, vmm, vmm);
|
||||
else
|
||||
// direct mov_dw available only on avx512
|
||||
if (is_saturation()) { // emulate with pack_dw + permute + pure store for saturation mode
|
||||
if (is_signed)
|
||||
h->uni_vpackssdw(vmm, vmm, vmm);
|
||||
else
|
||||
h->uni_vpackusdw(vmm, vmm, vmm);
|
||||
// gather 2/4(cross lane) 64 bits into lower vmm to store
|
||||
// [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0]
|
||||
// [ 128 | 128 ] |--> [ 128 | 128 ]
|
||||
if (is_ymm) {
|
||||
h->vpermq(ymm, ymm, 0x08); // 00001000
|
||||
}
|
||||
} else { // emulate with AND + pure store for truncation mode
|
||||
h->vpand(vmm, vmm, table_val("mask_truncation_word"));
|
||||
h->uni_vpackusdw(vmm, vmm, vmm);
|
||||
// gather 2/4(cross lane) 64 bits into lower vmm to store
|
||||
// [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0]
|
||||
// [ 128 | 128 ] |--> [ 128 | 128 ]
|
||||
if (is_ymm) {
|
||||
h->vpermq(ymm, ymm, 0x08); // 00001000
|
||||
}
|
||||
|
||||
store_bytes(vmm, reg, offset, store_num * 2);
|
||||
@ -978,7 +1030,8 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
|
||||
}
|
||||
} else {
|
||||
switch (store_num) {
|
||||
case 16:
|
||||
case 16:
|
||||
if (is_saturation()) {
|
||||
if (is_signed) {
|
||||
h->vpmovsdw(ptr[reg + offset], vmm); // singed int32 saturate to signed int16.
|
||||
} else {
|
||||
@ -987,9 +1040,13 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
|
||||
h->uni_vpmaxsd(vmm, zero, vmm); // if singed bit is 1, set value as 0.
|
||||
h->vpmovusdw(ptr[reg + offset], vmm); // unsinged int32 saturate to unsigned int16.
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
if (mayiuse(cpu::x64::avx512_core)) {
|
||||
} else {
|
||||
h->vpmovdw(ptr[reg + offset], vmm);
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
if (mayiuse(cpu::x64::avx512_core)) {
|
||||
if (is_saturation()) {
|
||||
if (is_signed) {
|
||||
h->vpmovsdw(ptr[reg + offset], ymm);
|
||||
} else {
|
||||
@ -999,11 +1056,15 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
|
||||
h->vpmovusdw(ptr[reg + offset], ymm);
|
||||
}
|
||||
} else {
|
||||
store_dword_to_word_base();
|
||||
h->vpmovdw(ptr[reg + offset], ymm);
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
if (mayiuse(cpu::x64::avx512_core)) {
|
||||
} else {
|
||||
store_dword_to_word_base();
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
if (mayiuse(cpu::x64::avx512_core)) {
|
||||
if (is_saturation()) {
|
||||
if (is_signed) {
|
||||
h->vpmovsdw(ptr[reg + offset], xmm);
|
||||
} else {
|
||||
@ -1013,15 +1074,19 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
|
||||
h->vpmovusdw(ptr[reg + offset], xmm);
|
||||
}
|
||||
} else {
|
||||
store_dword_to_word_base();
|
||||
h->vpmovdw(ptr[reg + offset], xmm);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
if (is_zmm) {
|
||||
unsigned int mask = 1;
|
||||
mask = (mask << store_num) - mask;
|
||||
h->mov(Reg32(aux_gpr_idxs[0]), mask);
|
||||
h->kmovw(k_mask, Reg32(aux_gpr_idxs[0]));
|
||||
} else {
|
||||
store_dword_to_word_base();
|
||||
}
|
||||
break;
|
||||
default:
|
||||
if (is_zmm) {
|
||||
unsigned int mask = 1;
|
||||
mask = (mask << store_num) - mask;
|
||||
h->mov(Reg32(aux_gpr_idxs[0]), mask);
|
||||
h->kmovw(k_mask, Reg32(aux_gpr_idxs[0]));
|
||||
if (is_saturation()) {
|
||||
if (is_signed) {
|
||||
h->vpmovsdw(ptr[reg + offset], vmm | k_mask);
|
||||
} else {
|
||||
@ -1031,12 +1096,22 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya
|
||||
h->vpmovusdw(ptr[reg + offset], vmm | k_mask);
|
||||
}
|
||||
} else {
|
||||
store_dword_to_word_base();
|
||||
h->vpmovdw(ptr[reg + offset], vmm | k_mask);
|
||||
}
|
||||
break;
|
||||
} else {
|
||||
store_dword_to_word_base();
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void jit_store_emitter::register_table_entries() {
|
||||
if (is_truncation_emulation()) {
|
||||
push_arg_entry_of("mask_truncation_byte", 0x000000ff, true);
|
||||
push_arg_entry_of("mask_truncation_word", 0x0000ffff, true);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
||||
|
@ -39,6 +39,12 @@ struct store_emitter_params : public emitter_params {
|
||||
int store_num_;
|
||||
};
|
||||
|
||||
// Arithmetic modes for data type conversion in store_emitter
|
||||
enum arithmetic_mode {
|
||||
saturation,
|
||||
truncation
|
||||
};
|
||||
|
||||
class jit_load_emitter : public jit_emitter {
|
||||
public:
|
||||
jit_load_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, Precision src_prc, Precision dst_prc, int load_num,
|
||||
@ -101,7 +107,8 @@ private:
|
||||
class jit_store_emitter : public jit_emitter {
|
||||
public:
|
||||
jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, Precision src_prc, Precision dst_prc, int store_num,
|
||||
Precision exec_prc = Precision::FP32, emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_gpr);
|
||||
arithmetic_mode mode = arithmetic_mode::saturation, Precision exec_prc = Precision::FP32,
|
||||
emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_gpr);
|
||||
|
||||
/**
|
||||
* store_num values with src_prc in Vmm[in_vec_idx] is stored to ptr[reg_dst + offset_byte] address as dst_prc data, where offset_byte is in_idxs[1]
|
||||
@ -143,15 +150,21 @@ private:
|
||||
template <typename Vmm>
|
||||
void store_dword_to_word_extension(const Vmm &vmm, const Xbyak::Reg64 ®, int offset, bool is_bf16, bool is_signed, int store_size) const;
|
||||
|
||||
void register_table_entries() override;
|
||||
|
||||
size_t aux_gprs_count() const override;
|
||||
size_t aux_vecs_count() const override;
|
||||
|
||||
inline bool is_saturation() const;
|
||||
inline bool is_truncation_emulation() const;
|
||||
|
||||
std::string name_;
|
||||
int v_len_elt_; // 4/8/16
|
||||
int store_num_;
|
||||
int store_size_;
|
||||
Precision src_prc_;
|
||||
Precision dst_prc_;
|
||||
arithmetic_mode mode_ = arithmetic_mode::saturation;
|
||||
std::shared_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16_;
|
||||
};
|
||||
|
||||
|
671
src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp
Normal file
671
src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp
Normal file
@ -0,0 +1,671 @@
|
||||
// Copyright (C) 2020-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/variant.hpp>
|
||||
#include <cpu/x64/jit_generator.hpp>
|
||||
|
||||
#include "jit_snippets_emitters.hpp"
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
inline static void transform_idxs_to_regs(const std::vector<size_t>& idxs, std::vector<Reg64>& regs) {
|
||||
regs.resize(idxs.size());
|
||||
std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx){return Reg64(static_cast<int>(idx));});
|
||||
}
|
||||
|
||||
jit_container_emitter::jit_container_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
|
||||
const std::shared_ptr<ov::Node>& n) : jit_emitter(h, isa, n) {
|
||||
in_out_type_ = emitter_in_out_map::gpr_to_gpr;
|
||||
}
|
||||
|
||||
void jit_container_emitter::map_abstract_registers(const std::vector<size_t> &vec_pool, const std::vector<size_t> &gpr_pool,
|
||||
std::set<size_t>& vecs_used, std::set<size_t>& gprs_used) {
|
||||
if (body.empty())
|
||||
IE_THROW() << "Cannot map registers for jit_container_emitter when its body is empty";
|
||||
auto abstract_to_physical = [](const std::vector<size_t>& abstract_regs, const std::vector<size_t>& regs_pool) {
|
||||
std::vector<size_t> physical_regs(abstract_regs.size());
|
||||
for (size_t i = 0; i < abstract_regs.size(); i++)
|
||||
physical_regs[i] = regs_pool.at(abstract_regs[i]);
|
||||
return physical_regs;
|
||||
};
|
||||
for (auto& code : body) {
|
||||
const auto& emitter = code.first;
|
||||
std::vector<size_t> in_abstract_regs, out_abstract_regs;
|
||||
std::tie(in_abstract_regs, out_abstract_regs) = code.second;
|
||||
std::vector<size_t> in_physical_regs, out_physical_regs;
|
||||
switch (std::dynamic_pointer_cast<jit_emitter>(emitter)->get_in_out_type()) {
|
||||
case gpr_to_gpr:
|
||||
// Note that gpr_to_gpr is used for high-level utility operations like Kernel/TileScheduler/Tile.
|
||||
// Input registers are not mapped in this case, since they contain utility info
|
||||
// (num_params, tile increment, etc.), but not reg indexes.
|
||||
in_physical_regs = std::move(in_abstract_regs);
|
||||
out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool));
|
||||
gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end());
|
||||
break;
|
||||
case gpr_to_vec:
|
||||
// Load Emitters
|
||||
in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, gpr_pool));
|
||||
out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool));
|
||||
gprs_used.insert(in_physical_regs.begin(), in_physical_regs.end());
|
||||
vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end());
|
||||
break;
|
||||
case vec_to_gpr:
|
||||
// Store Emitters
|
||||
in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool));
|
||||
out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool));
|
||||
vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end());
|
||||
gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end());
|
||||
break;
|
||||
case vec_to_vec:
|
||||
// Regular operations
|
||||
in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool));
|
||||
out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool));
|
||||
vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end());
|
||||
vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end());
|
||||
break;
|
||||
default:
|
||||
IE_THROW() << "Unhandled in_out type";
|
||||
}
|
||||
code.second = std::make_pair(in_physical_regs, out_physical_regs);
|
||||
if (auto container = std::dynamic_pointer_cast<jit_container_emitter>(code.first))
|
||||
container->map_abstract_registers(vec_pool, gpr_pool, vecs_used, gprs_used);
|
||||
}
|
||||
}
|
||||
|
||||
KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
                             const std::shared_ptr<ov::Node>& n) : jit_container_emitter(h, isa, n) {
    // The node must be a snippets::op::Kernel carrying a non-empty body and attached compile params.
    const auto kernel = ov::as_type_ptr<ngraph::snippets::op::Kernel>(n);
    if (!kernel)
        IE_THROW() << "KernelEmitter invoked with invalid op argument";
    if (kernel->region.empty())
        IE_THROW() << "KernelEmitter invoked with empty body";
    body = kernel->region;
    if (!kernel->compile_params)
        IE_THROW() << "KernelEmitter invoked without compile_params";
    jcp = *reinterpret_cast<const jit_snippets_compile_args*>(kernel->compile_params);
    // Start from full pools of 16 general-purpose and 16 vector registers: indexes [0..15]
    gp_regs_pool.resize(16);
    vec_regs_pool.resize(16);
    std::iota(gp_regs_pool.begin(), gp_regs_pool.end(), 0);
    std::iota(vec_regs_pool.begin(), vec_regs_pool.end(), 0);
    auto erase_from_pool = [](std::vector<size_t>& pool, const std::set<size_t>& to_remove) {
        // It's important to keep the order of other elements
        const auto new_end = std::remove_if(pool.begin(), pool.end(),
                                            [&to_remove](size_t reg) { return to_remove.count(reg) != 0; });
        pool.erase(new_end, pool.end());
    };
    // Reserve stack base and pointer for push(...) and pop(...) operations
    // Reserve abi_param1 and abi_param2, since they'll be used to pass runtime call args to kernel
    erase_from_pool(gp_regs_pool, {Xbyak::Operand::RSP, Xbyak::Operand::RBP,
                                   static_cast<size_t>(abi_param1.getIdx()),
                                   static_cast<size_t>(abi_param2.getIdx())});
    std::set<size_t> vecs_used, gprs_used;
    // Map abstract register indexes of the nested emitters to the physical ones from the pools
    map_abstract_registers(vec_regs_pool, gp_regs_pool, vecs_used, gprs_used);
    erase_from_pool(gp_regs_pool, gprs_used);
    erase_from_pool(vec_regs_pool, vecs_used);
    // Remember used gprs to pass it to the TileSchedulerEmitter, so it can init them with appropriate data ptrs
    gp_regs_used = std::vector<size_t>(gprs_used.begin(), gprs_used.end());
}
|
||||
|
||||
// Public entry point: validates the inputs and emits the whole kernel.
// No emitter_context is needed at this level, so nullptr is passed to emit_impl.
void KernelEmitter::emit_code(const std::vector<size_t> &in,
                              const std::vector<size_t> &out,
                              const std::vector<size_t> &pool,
                              const std::vector<size_t> &gpr) const {
    validate_arguments(in, out, pool, gpr);
    emit_impl(in, out, pool, gpr, nullptr);
}
|
||||
|
||||
void KernelEmitter::validate_arguments(const std::vector<size_t> &in,
|
||||
const std::vector<size_t> &out,
|
||||
const std::vector<size_t> &pool,
|
||||
const std::vector<size_t> &gpr) const {
|
||||
if (in.size() != 2)
|
||||
IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 2, got " << in.size();
|
||||
if (!out.empty())
|
||||
IE_THROW() << "KKernelEmitter got invalid number of outputs. Expected 0, got " << out.size();
|
||||
}
|
||||
|
||||
void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params,
|
||||
const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector<Reg64>& data_ptr_regs) const {
|
||||
const int64_t harness_num_dims = jcp.output_dims.size() - 1;
|
||||
auto init_ptrs_with_offsets = [&](Reg64 pointer, const int64_t *offsets, Reg64 reg_tmp) {
|
||||
for (int j = 0; j < harness_num_dims; j++) {
|
||||
if (jcp.output_dims[j] != 1 && offsets[j] != 0) {
|
||||
h->mov(reg_tmp, offsets[j]);
|
||||
h->imul(reg_tmp, h->ptr[reg_indexes + j * sizeof(size_t)]);
|
||||
h->add(pointer, reg_tmp);
|
||||
}
|
||||
}
|
||||
};
|
||||
for (auto i = 0; i < num_params; i++) {
|
||||
if (i < num_inputs)
|
||||
h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(src_ptrs) + i * sizeof(void*)]);
|
||||
else
|
||||
h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]);
|
||||
// we can use the last data_ptr_reg as tmp_reg until the last iteration, and reg_const_params then
|
||||
Reg64 reg_tmp = i < num_params-1 ? data_ptr_regs.back() : reg_const_params;
|
||||
init_ptrs_with_offsets(data_ptr_regs[i], &jcp.data_offsets[i * harness_num_dims], reg_tmp);
|
||||
}
|
||||
}
|
||||
void KernelEmitter::emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& allocated_vec_regs,
|
||||
const std::vector<size_t>& allocated_gp_regs,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const {
|
||||
h->preamble();
|
||||
|
||||
const size_t num_inputs = in[0];
|
||||
const size_t num_outputs = in[1];
|
||||
|
||||
Reg64 reg_indexes = Reg64(abi_param1.getIdx());
|
||||
Reg64 reg_const_params = Reg64(abi_param2.getIdx());
|
||||
std::vector<Reg64> data_ptr_regs;
|
||||
transform_idxs_to_regs(gp_regs_used, data_ptr_regs);
|
||||
|
||||
init_data_pointers(num_inputs, num_inputs + num_outputs, reg_indexes, reg_const_params, data_ptr_regs);
|
||||
// todo: emit_impl is a const method, so we can't just push_back unused regs to the gp_regs_pool.
|
||||
// we need a more elegant approach to avoid a full copy here
|
||||
auto local_gpr_pool = gp_regs_pool;
|
||||
local_gpr_pool.push_back(static_cast<size_t>(reg_indexes.getIdx()));
|
||||
local_gpr_pool.push_back(static_cast<size_t>(reg_const_params.getIdx()));
|
||||
for (const auto& c : body) {
|
||||
const auto& emitter = c.first;
|
||||
std::vector<size_t> in_regs, out_regs;
|
||||
std::tie(in_regs, out_regs) = c.second;
|
||||
if (auto tile_scheduler = std::dynamic_pointer_cast<TileSchedulerEmitter>(emitter))
|
||||
out_regs = gp_regs_used;
|
||||
emitter->emit_code(in_regs, out_regs, vec_regs_pool, local_gpr_pool);
|
||||
}
|
||||
h->postamble();
|
||||
}
|
||||
|
||||
// Wraps a snippets::op::TileScheduler: the body holds exactly two Tiles (vector, then scalar),
// and the compile params are cached for scheduling decisions at emission time.
TileSchedulerEmitter::TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
                                           const std::shared_ptr<ov::Node>& n) : jit_container_emitter(h, isa, n) {
    const auto tile_scheduler = ov::as_type_ptr<ngraph::snippets::op::TileScheduler>(n);
    if (!tile_scheduler)
        IE_THROW() << "TileSchedulerEmitter invoked with invalid op argument";
    if (!tile_scheduler->compile_params)
        // Fixed error message: it previously named "TileEmitter" although this is the TileScheduler
        IE_THROW() << "TileSchedulerEmitter invoked without compile_params";
    body = {tile_scheduler->vector_region, tile_scheduler->scalar_region};
    jcp = *reinterpret_cast<const jit_snippets_compile_args*>(tile_scheduler->compile_params);
}
|
||||
// Public entry point: validates the arguments and emits the scheduler code.
// No emitter_context is needed at this level, so nullptr is passed to emit_impl.
void TileSchedulerEmitter::emit_code(const std::vector<size_t> &in,
                                     const std::vector<size_t> &out,
                                     const std::vector<size_t> &pool,
                                     const std::vector<size_t> &gpr) const {
    validate_arguments(in, out, pool, gpr);
    emit_impl(in, out, pool, gpr, nullptr);
}
|
||||
void TileSchedulerEmitter::validate_arguments(const std::vector<size_t> &in,
|
||||
const std::vector<size_t> &out,
|
||||
const std::vector<size_t> &pool,
|
||||
const std::vector<size_t> &gpr) const {
|
||||
if (in.size() != 3)
|
||||
IE_THROW() << "TileSchedulerEmitter got invalid number of inputs. Expected 3, got " << in.size();
|
||||
if (out.size() != in[0] + in[1])
|
||||
IE_THROW() << "TileSchedulerEmitter got invalid number of outputs. Expected " << in[0] + in[1] << " , got " << out.size();
|
||||
if (body.size() != 2)
|
||||
IE_THROW() << "TileSchedulerEmitter got invalid body size, expected 2 (vector & scalar TileEmitter), got " << body.size();
|
||||
if (!(std::dynamic_pointer_cast<TileEmitter>(body[0].first) && std::dynamic_pointer_cast<TileEmitter>(body[1].first)))
|
||||
IE_THROW() << "TileSchedulerEmitter can contain only TileEmitters inside its body";
|
||||
}
|
||||
|
||||
// Emits the vector and scalar Tiles, skipping emission entirely for tiles that would never run
// and unrolling (emit_body directly) tiles that run exactly once. The inner work amount is taken
// from jcp.scheduler_dims[1]; reg_inner_amount is initialized here only when a tile loops.
void TileSchedulerEmitter::emit_tiles(const Reg64& reg_inner_amount, const std::vector<Reg64>& data_ptr_regs, size_t vector_size,
                                      const std::vector<size_t>& vec_pool, const std::vector<size_t>& gpr_pool) const {
    // TileAllocatedEmitter is just an alias to perform dynamic_pointer_cast only once and reuse it below several times
    using TileAllocatedEmitter = std::pair<std::shared_ptr<TileEmitter>, const ngraph::snippets::RegInfo&>;
    TileAllocatedEmitter vector_tile {std::dynamic_pointer_cast<TileEmitter>(body[0].first), body[0].second};
    TileAllocatedEmitter scalar_tile {std::dynamic_pointer_cast<TileEmitter>(body[1].first), body[1].second};
    const size_t inner_work_amount = jcp.scheduler_dims[1];
    auto process_tile =
        [&](const bool evaluate_once, const TileAllocatedEmitter& tile) {
            // If Tile is evaluated only once, then we can emit its body directly and skip work_amount decrements and checks
            if (evaluate_once) {
                tile.first->emit_body(vec_pool, gpr_pool);
            } else {
                std::vector<size_t> in_regs, out_regs;
                std::tie(in_regs, out_regs) = tile.second;
                // pass work_amount reg to Tile
                in_regs.push_back(static_cast<size_t>(reg_inner_amount.getIdx()));
                for (const auto& reg : data_ptr_regs)
                    out_regs.emplace_back(reg.getIdx());
                tile.first->emit_code(in_regs, out_regs, vec_pool, gpr_pool);
            }
        };
    // todo: these optimizations should be performed on using Tile graph representation in the future
    bool vector_evaluate_once = false;
    // Vector tile runs only if at least one full vector of work is available
    if (inner_work_amount >= vector_size) {
        vector_evaluate_once = inner_work_amount < 2 * vector_size;
        // Need to set proper work amount for inner tiles if evaluated multiple times
        if (!vector_evaluate_once)
            h->mov(reg_inner_amount, inner_work_amount);
        process_tile(vector_evaluate_once, vector_tile);
    }
    // Scalar tile handles the remainder (inner_work_amount % vector_size elements)
    if (inner_work_amount % vector_size >= 1) {
        bool scalar_evaluate_once = inner_work_amount % vector_size < 2;
        if (!scalar_evaluate_once) {
            // vector_tile is not executed, work_amount is not set
            if (inner_work_amount < vector_size) {
                h->mov(reg_inner_amount, inner_work_amount);
            // vector_tile is executed, but work_amount is neither set nor decremented appropriately.
            } else if (vector_evaluate_once) {
                // The unrolled vector tile skipped its pointer increments, emit them before the scalar tail
                vector_tile.first->emit_ptr_increments(data_ptr_regs);
                h->mov(reg_inner_amount, inner_work_amount - vector_size);
            }
            // else: vector_tile is executed multiple times, so work_amount is already set
        } else {
            if (vector_evaluate_once) {
                vector_tile.first->emit_ptr_increments(data_ptr_regs);
            }
        }
        process_tile(scalar_evaluate_once, scalar_tile);
    }
}
|
||||
|
||||
void TileSchedulerEmitter::emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& vec_pool,
|
||||
const std::vector<size_t>& gpr_pool,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const {
|
||||
const size_t num_inputs = in[0];
|
||||
const size_t num_outputs = in[1];
|
||||
const size_t vector_size = in[2];
|
||||
const size_t num_params = num_inputs + num_outputs;
|
||||
const auto& data_ptr_reg_idxs(out);
|
||||
std::vector<Reg64> data_ptr_regs;
|
||||
transform_idxs_to_regs(data_ptr_reg_idxs, data_ptr_regs);
|
||||
// todo: emit_impl has const input args, so we can't just pop_back necessary regs from gpr_pool.
|
||||
// we need a more elegant approach to avoid a full copy here. Similar problem is demonstrated in KernelEmitter
|
||||
auto local_gpr_pool = gpr_pool;
|
||||
Reg64 reg_outer_amount = Reg64(static_cast<int>(local_gpr_pool.back()));
|
||||
local_gpr_pool.pop_back();
|
||||
Reg64 reg_inner_amount = Reg64(static_cast<int>(local_gpr_pool.back()));
|
||||
local_gpr_pool.pop_back();
|
||||
Label for_body;
|
||||
const size_t outer_work_amount = jcp.scheduler_dims[0];
|
||||
if (outer_work_amount == 1) {
|
||||
// emit code directly without looping over external dim
|
||||
emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool);
|
||||
} else if (outer_work_amount > 1) {
|
||||
// We need to create a Loop in this case
|
||||
h->mov(reg_outer_amount, outer_work_amount);
|
||||
h->L(for_body);
|
||||
{
|
||||
emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool);
|
||||
|
||||
// Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers
|
||||
// after reading/writing. This might be a problem if we need to read the same data multiple times (broadcasting shapes).
|
||||
// To overcome this limitation, we add appropriate negative offsets if necessary.
|
||||
for (auto i = 0; i < num_params; i++) {
|
||||
if (jcp.scheduler_offsets[i] != 0) {
|
||||
h->add(data_ptr_regs[i], jcp.scheduler_offsets[i]);
|
||||
}
|
||||
}
|
||||
// Note that outer dimensions are always incremented by 1 (outer tiles are always scalar)
|
||||
h->sub(reg_outer_amount, 1);
|
||||
h->cmp(reg_outer_amount, 1);
|
||||
h->jge(for_body, CodeGenerator::T_NEAR);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Exposes the nested emitters (with their register assignments) for external processing,
// e.g. so the enclosing container can map abstract registers to physical ones.
std::vector<AllocatedEmitter>& TileEmitter::get_nested_code() {
    return body;
}
|
||||
|
||||
// Wraps a snippets::op::Tile: caches its region and the i/o metadata needed at emission time.
TileEmitter::TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
                         const std::shared_ptr<ov::Node>& n) : jit_container_emitter(h, isa, n) {
    const auto tile = ov::as_type_ptr<ngraph::snippets::op::Tile>(n);
    if (!tile)
        IE_THROW() << "TileEmitter invoked with invalid op argument";
    body = tile->region;
    if (body.empty())
        IE_THROW() << "TileEmitter is invoked with empty body";
    // Cache the Tile parameters: i/o counts, per-port dims/data sizes and the loop increment
    increment = tile->increment;
    num_inputs = tile->num_inputs;
    num_outputs = tile->num_outputs;
    io_dims = tile->io_dims;
    io_data_size = tile->io_data_size;
    // io_dims must describe every input and every output exactly once
    if (io_dims.size() != num_inputs + num_outputs)
        IE_THROW() << "TileEmitter constructor got inconsistent arguments. Check num_inputs + num_outputs == io_dims.size()";
}
|
||||
|
||||
// Public entry point: validates the arguments and emits the tile loop.
// No emitter_context is needed at this level, so nullptr is passed to emit_impl.
void TileEmitter::emit_code(const std::vector<size_t> &in,
                            const std::vector<size_t> &out,
                            const std::vector<size_t> &pool,
                            const std::vector<size_t> &gpr) const {
    validate_arguments(in, out, pool, gpr);
    emit_impl(in, out, pool, gpr, nullptr);
}
|
||||
|
||||
void TileEmitter::validate_arguments(const std::vector<size_t> &in,
|
||||
const std::vector<size_t> &out,
|
||||
const std::vector<size_t> &pool,
|
||||
const std::vector<size_t> &gpr) const {
|
||||
if (in.size() != 1)
|
||||
IE_THROW() << "TileEmitter got invalid number of inputs. Expected 1, got " << in.size();
|
||||
if (out.size() != io_dims.size())
|
||||
IE_THROW() << "TileEmitter got invalid number of outputs. Expected " << io_dims.size() << " , got " << out.size();
|
||||
}
|
||||
|
||||
void TileEmitter::emit_body(const std::vector<size_t>& vec_pool, const std::vector<size_t>& gpr_pool) const {
|
||||
for (auto& code : body)
|
||||
code.first->emit_code(code.second.first, code.second.second, vec_pool, gpr_pool);
|
||||
}
|
||||
|
||||
void TileEmitter::emit_ptr_increments(const std::vector<Reg64>& data_ptr_regs) const {
|
||||
for (size_t i = 0; i < num_inputs + num_outputs; i++) {
|
||||
// those with dims == 1 will be broadcasted, hence don't require increment
|
||||
if (io_dims[i] != 1)
|
||||
h->add(data_ptr_regs[i], increment * io_data_size[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void TileEmitter::emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& vec_pool,
|
||||
const std::vector<size_t>& gpr_pool,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const {
|
||||
Reg64 work_amount = Reg64(static_cast<int>(in[0]));
|
||||
std::vector<Reg64> data_ptr_regs;
|
||||
transform_idxs_to_regs(out, data_ptr_regs);
|
||||
Label for_body;
|
||||
// Note that:
|
||||
// * Work amount must be set by TileScheduler that executes Tiles
|
||||
// * TileScheduler executes Tile only if it has to perform >= 1 iterations
|
||||
h->L(for_body);
|
||||
emit_body(vec_pool, gpr_pool);
|
||||
emit_ptr_increments(data_ptr_regs);
|
||||
h->sub(work_amount, increment);
|
||||
h->cmp(work_amount, increment);
|
||||
h->jge(for_body, CodeGenerator::T_NEAR);
|
||||
}
|
||||
|
||||
// Broadcasts (or simply moves) a value between vector registers.
// Broadcasting is used when the input is a scalar (empty shape) or its innermost
// dimension differs from the output's innermost dimension.
BroadcastMoveEmitter::BroadcastMoveEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
                                           const std::shared_ptr<ov::Node>& n) : jit_emitter(h, isa, n) {
    const auto& src_shape = n->get_input_shape(0);
    // short-circuit keeps rbegin() from being dereferenced on an empty shape
    use_broadcast = src_shape.empty() ||
                    *src_shape.rbegin() != *n->get_output_shape(0).rbegin();

    if (n->get_input_element_type(0) != n->get_output_element_type(0))
        IE_THROW() << "BroadcastMoveEmitter supports only equal input and output types but gets: "
                   << n->get_input_element_type(0) << " and " << n->get_output_element_type(0);
    byte_size = n->get_input_element_type(0).size();
}
|
||||
|
||||
void BroadcastMoveEmitter::emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const {
|
||||
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
|
||||
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
|
||||
} else {
|
||||
IE_THROW() << "BroadcastMove emitter doesn't support " << host_isa_;
|
||||
}
|
||||
}
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void BroadcastMoveEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
|
||||
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
|
||||
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
Vmm vmm_src0 = Vmm(in[0]);
|
||||
Xmm xmm_src0 = Xmm(in[0]);
|
||||
Vmm vmm_dst = Vmm(out[0]);
|
||||
|
||||
if (use_broadcast) {
|
||||
switch (byte_size) {
|
||||
case 4: h->uni_vbroadcastss(vmm_dst, xmm_src0); break;
|
||||
case 2: h->vpbroadcastw(vmm_dst, xmm_src0); break;
|
||||
case 1: h->vpbroadcastb(vmm_dst, xmm_src0); break;
|
||||
default: assert(!"unsupported data type");
|
||||
}
|
||||
} else {
|
||||
if (vmm_src0 != vmm_dst)
|
||||
h->uni_vmovups(vmm_dst, vmm_src0);
|
||||
}
|
||||
}
|
||||
|
||||
// Emits a compile-time float constant. The value is stored in the emitter's
// constant table and broadcast into a vector register at run time.
ScalarEmitter::ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
                             const std::shared_ptr<ov::Node>& n) : jit_emitter(h, isa, n) {
    const auto scalar = ov::as_type_ptr<ngraph::snippets::op::Scalar>(n);
    value = dnnl::impl::cpu::x64::float2int(scalar->cast_vector<float>()[0]);
    push_arg_entry_of("scalar", value, true);
    prepare_table();
}
|
||||
|
||||
void ScalarEmitter::emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const {
|
||||
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
|
||||
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
|
||||
} else {
|
||||
IE_THROW() << "Scalar emitter doesn't support " << host_isa_;
|
||||
}
|
||||
}
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void ScalarEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
|
||||
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
|
||||
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
Vmm vmm_dst = Vmm(out[0]);
|
||||
h->uni_vbroadcastss(vmm_dst, table_val("scalar"));
|
||||
}
|
||||
|
||||
|
||||
// Common base of Load/Store emitters: caches the source and destination
// precisions converted from the node's input/output element types.
MemoryEmitter::MemoryEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
                             const std::shared_ptr<ov::Node>& n) : jit_emitter(h, isa, n) {
    src_prc = InferenceEngine::details::convertPrecision(n->get_input_element_type(0));
    dst_prc = InferenceEngine::details::convertPrecision(n->get_output_element_type(0));
}
|
||||
|
||||
// Stores `count` elements from a vector register to memory without precision conversion
// (input and output precisions must therefore be equal).
StoreEmitter::StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
                           const std::shared_ptr<ov::Node>& n) : MemoryEmitter(h, isa, n) {
    if (src_prc != dst_prc)
        IE_THROW() << "StoreEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name();

    in_out_type_ = emitter_in_out_map::vec_to_gpr;
    count = ov::as_type_ptr<ngraph::snippets::op::Store>(n)->get_count();
    // delegate the actual memory access to the common jit_store_emitter
    store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count));
}
|
||||
|
||||
void StoreEmitter::emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const {
|
||||
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
|
||||
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
|
||||
} else {
|
||||
IE_THROW() << "Store emitter doesn't support " << host_isa_;
|
||||
}
|
||||
}
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void StoreEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
|
||||
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
|
||||
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
if (!store_emitter)
|
||||
IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!";
|
||||
store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs);
|
||||
}
|
||||
|
||||
// Emits auxiliary data required by the nested jit_store_emitter.
void StoreEmitter::emit_data() const {
    store_emitter->emit_data();
}
|
||||
|
||||
// Loads `count` elements from memory into a vector register without precision conversion
// (input and output precisions must therefore be equal).
LoadEmitter::LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
                         const std::shared_ptr<ov::Node>& n) : MemoryEmitter(h, isa, n) {
    if (src_prc != dst_prc)
        IE_THROW() << "LoadEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name();

    in_out_type_ = emitter_in_out_map::gpr_to_vec;
    count = ov::as_type_ptr<ngraph::snippets::op::Load>(n)->get_count();
    // delegate the actual memory access to the common jit_load_emitter
    load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count));
}
|
||||
|
||||
void LoadEmitter::emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const {
|
||||
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
|
||||
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
|
||||
} else {
|
||||
IE_THROW() << "Load emitter doesn't support " << host_isa_;
|
||||
}
|
||||
}
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void LoadEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
|
||||
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
|
||||
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
if (!load_emitter)
|
||||
IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!";
|
||||
load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs);
|
||||
}
|
||||
|
||||
// Emits auxiliary data required by the nested jit_load_emitter.
void LoadEmitter::emit_data() const {
    load_emitter->emit_data();
}
|
||||
|
||||
// Loads a single element from memory and broadcasts it across the whole vector register.
// No precision conversion is performed, so input and output precisions must be equal.
BroadcastLoadEmitter::BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
                                           const std::shared_ptr<ov::Node>& n) : MemoryEmitter(h, isa, n) {
    if (src_prc != dst_prc)
        IE_THROW() << "BroadcastEmitters support only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name();

    in_out_type_ = emitter_in_out_map::gpr_to_vec;
}
|
||||
|
||||
void BroadcastLoadEmitter::emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const {
|
||||
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
|
||||
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
|
||||
} else {
|
||||
IE_THROW() << "BroadcastLoad emitter doesn't support " << host_isa_;
|
||||
}
|
||||
}
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void BroadcastLoadEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
|
||||
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
|
||||
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
Reg64 in_reg(in[0]);
|
||||
Vmm vmm_dst = Vmm(out[0]);
|
||||
|
||||
// In doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`,
|
||||
// key point here is not to add post-increment, it might be fixed by some other approach in future
|
||||
switch (src_prc.size()) {
|
||||
case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg]); break;
|
||||
case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg]); break;
|
||||
case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg]); break;
|
||||
default: assert(!"unsupported data type");
|
||||
}
|
||||
}
|
||||
|
||||
// Loads `count` elements and converts them from src_prc to dst_prc on the fly
// (the conversion is handled by the nested jit_load_emitter).
LoadConvertEmitter::LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
    : MemoryEmitter(h, isa, n) {
    in_out_type_ = emitter_in_out_map::gpr_to_vec;
    count = ov::as_type_ptr<ngraph::snippets::op::Load>(n)->get_count();
    load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count));
}
|
||||
|
||||
void LoadConvertEmitter::emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const {
|
||||
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
|
||||
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
|
||||
} else {
|
||||
IE_THROW() << "LoadConvert emitter doesn't support " << host_isa_;
|
||||
}
|
||||
}
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void LoadConvertEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
|
||||
if (!load_emitter)
|
||||
IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!";
|
||||
load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs);
|
||||
}
|
||||
|
||||
// Emits auxiliary data required by the nested jit_load_emitter.
void LoadConvertEmitter::emit_data() const {
    load_emitter->emit_data();
}
|
||||
|
||||
// Stores `count` elements converting them from src_prc to dst_prc on the fly.
// The rounding behavior (truncation vs saturation) is selected by the concrete op type.
StoreConvertEmitter::StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
                                         const std::shared_ptr<ov::Node>& n) : MemoryEmitter(h, isa, n) {
    count = ov::as_type_ptr<ngraph::snippets::op::Store>(n)->get_count();
    in_out_type_ = emitter_in_out_map::vec_to_gpr;

    const bool is_truncation = ov::is_type<ov::intel_cpu::StoreConvertTruncation>(n);
    if (is_truncation || ov::is_type<ov::intel_cpu::StoreConvertSaturation>(n)) {
        const auto mode = is_truncation ? arithmetic_mode::truncation : arithmetic_mode::saturation;
        store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count, mode));
    }
}
|
||||
|
||||
void StoreConvertEmitter::emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const {
|
||||
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
|
||||
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
|
||||
} else {
|
||||
IE_THROW() << "StoreConvert emitter doesn't support " << host_isa_;
|
||||
}
|
||||
}
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void StoreConvertEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
|
||||
if (!store_emitter)
|
||||
IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!";
|
||||
store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs);
|
||||
}
|
||||
|
||||
// Emits auxiliary data required by the nested jit_store_emitter.
void StoreConvertEmitter::emit_data() const {
    store_emitter->emit_data();
}
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
@ -6,15 +6,21 @@
|
||||
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/variant.hpp>
|
||||
#include <ie_ngraph_utils.hpp>
|
||||
|
||||
#include "jit_emitter.hpp"
|
||||
#include "jit_load_store_emitters.hpp"
|
||||
|
||||
#include "snippets_transformations/op/store_convert.hpp"
|
||||
|
||||
using namespace Xbyak;
|
||||
using ngraph::snippets::AllocatedEmitter;
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
#define SNIPPETS_MAX_SNIPPETS_DIMS 7
|
||||
|
||||
#define SNIPPETS_MAX_SNIPPETS_DIMS 12
|
||||
#define SNIPPETS_MAX_HARNESS_DIMS 5
|
||||
#define SNIPPETS_MAX_TILE_RANK 2
|
||||
#define GET_OFF(field) offsetof(jit_snippets_call_args, field)
|
||||
@ -30,11 +36,27 @@ struct jit_snippets_compile_args {
|
||||
std::vector<size_t> output_dims = {};
|
||||
};
|
||||
///
|
||||
/// \brief Kernel is the only entry point to Codogen Jit compilation. Kernel calculates appropriate data offsets,
|
||||
/// and invokes enclosed outer Tiles. Only 2d Tiles are currently supported, so the emitters should
|
||||
/// be organized in the following way:
|
||||
/// KernelEmitter { /* entry point */
|
||||
/// TileEmitter { /* outer tile */
|
||||
/// \brief jit_container_emitter designed to wrap Emitters that contain other Emitters (presently KernelEmitter,
|
||||
/// TileSchedulerEmitter and TileEmitter). This is needed to provide common interface for register mapping
|
||||
/// (abstract to physical) and nested code access.
|
||||
///
|
||||
class jit_container_emitter: public jit_emitter {
public:
    jit_container_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
                          const std::shared_ptr<ov::Node>& n);
protected:
    // maps gpr and vec abstract registers to physical ones. Physical reg indexes are taken from the provided pools
    // (the first 2 args). All the used gpr and vec registers are also stored in the provided sets (the second 2 args).
    void map_abstract_registers(const std::vector<size_t>&, const std::vector<size_t>&,
                                std::set<size_t>&, std::set<size_t>&);
    // Nested emitters together with their register assignments (abstract before mapping, physical after)
    std::vector<AllocatedEmitter> body;
};
|
||||
///
|
||||
/// \brief Kernel is the only entry point to Codogen Jit compilation. Kernel perform abstract-to-physical register
|
||||
/// mapping and creates pools of available gpr and vec registers. Kernel is expected to contain (at least one)
|
||||
/// TileSchedulerEmitter. In general the enclosed emitters should be organized in the following way:
|
||||
/// KernelEmitter { /* entry point, maps registers, creates pools of available registers */
|
||||
/// TileSchedulerEmitter { /* executes required inner, avoids emitting code that won't be executed */
|
||||
/// TileEmitter { /* inner vector tile */
|
||||
/// ... /* All the necessary Load/Strore/elementwise emitters */
|
||||
/// }
|
||||
@ -43,255 +65,110 @@ struct jit_snippets_compile_args {
|
||||
/// }
|
||||
/// }
|
||||
/// }
|
||||
/// Note that Kernel params are passed directly to the emit_code(). The vector of inputs should contain 2 arguments, the
|
||||
/// output vector should be empty. Input parameters
|
||||
/// Note that Kernel doesn't accept any input arguments.
|
||||
///
|
||||
/// \param in[0] The number of the node inputs
|
||||
/// \param in[1] The number of the node outputs
|
||||
///
|
||||
// Todo: Scheduler dims and offsets are currently calculated in Subgraph node and passed to the KernelEmitter.
|
||||
// However, it seems more natural to calculate all the offsets right in the Kernel op, because the calculation is
|
||||
// not device-specific. It is based only on input/output dims (which we already know) and harness num dims
|
||||
// (which we should pass from the plugin). It seems also better to wrap the enclosed emitters in tiles in the Kernel op
|
||||
// and avoid creating empty tiles.
|
||||
class KernelEmitter : public jit_emitter {
|
||||
class KernelEmitter : public jit_container_emitter {
|
||||
public:
|
||||
KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
|
||||
const std::shared_ptr<ov::Node>& n)
|
||||
: jit_emitter(h, isa, n) {
|
||||
const auto kernel = ov::as_type_ptr<ngraph::snippets::op::Kernel>(n);
|
||||
if (!kernel)
|
||||
IE_THROW() << "KernelEmitter invoked with invalid op argument";
|
||||
if (!kernel->compile_params)
|
||||
IE_THROW() << "KernelEmitter invoked without compile_params";
|
||||
code = kernel->region;
|
||||
jcp = *reinterpret_cast<const jit_snippets_compile_args*>(kernel->compile_params);
|
||||
}
|
||||
const std::shared_ptr<ov::Node>& n);
|
||||
|
||||
size_t get_inputs_num() const override {return 0;}
|
||||
|
||||
void emit_code(const std::vector<size_t> &in, const std::vector<size_t> &out,
|
||||
const std::vector<size_t> &pool = {}, const std::vector<size_t> &gpr = {}) const override {
|
||||
validate_arguments(in, out, pool, gpr);
|
||||
emit_impl(in, out, pool, gpr, nullptr);
|
||||
}
|
||||
void emit_code(const std::vector<size_t> &in,
|
||||
const std::vector<size_t> &out,
|
||||
const std::vector<size_t> &pool,
|
||||
const std::vector<size_t> &gpr) const override;
|
||||
|
||||
private:
|
||||
void validate_arguments(const std::vector<size_t> &in, const std::vector<size_t> &out,
|
||||
const std::vector<size_t> &pool = {}, const std::vector<size_t> &gpr = {}) const override {
|
||||
if (in.size() != 2)
|
||||
IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 2, got " << in.size();
|
||||
if (out.size() != 0)
|
||||
IE_THROW() << "KernelEmitter got unexpected output arguments.";
|
||||
const size_t num_params = in[0] + in[1];
|
||||
if (num_params > SNIPPETS_MAX_SNIPPETS_DIMS)
|
||||
IE_THROW() << "KernelEmitter supports only up to " << SNIPPETS_MAX_SNIPPETS_DIMS <<
|
||||
" parameters, got " << num_params;
|
||||
const int64_t harness_num_dims = jcp.output_dims.size() - 1;
|
||||
if (harness_num_dims > SNIPPETS_MAX_HARNESS_DIMS)
|
||||
IE_THROW() << "KernelEmitter supports harness with up to " << SNIPPETS_MAX_HARNESS_DIMS <<
|
||||
" dims, got " << harness_num_dims;
|
||||
}
|
||||
|
||||
void validate_arguments(const std::vector<size_t> &in,
|
||||
const std::vector<size_t> &out,
|
||||
const std::vector<size_t> &pool,
|
||||
const std::vector<size_t> &gpr) const override;
|
||||
void emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override {
|
||||
const size_t num_inputs = in[0];
|
||||
const size_t num_outputs = in[1];
|
||||
const size_t num_params = num_inputs + num_outputs;
|
||||
int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1
|
||||
const int64_t harness_num_dims = jcp.output_dims.size() - 1;
|
||||
|
||||
Reg64 reg_indexes { dnnl::impl::cpu::x64::abi_param_regs[0] };
|
||||
Reg64 reg_const_params { dnnl::impl::cpu::x64::abi_param_regs[1] };
|
||||
Xbyak::Reg64 reg_tmp_64 { dnnl::impl::cpu::x64::abi_not_param_reg };
|
||||
|
||||
h->preamble();
|
||||
|
||||
std::vector<Reg64> regs(num_params);
|
||||
auto init_ptrs_with_offsets = [&](Reg64 pointer, const int64_t *offsets) {
|
||||
for (int j = 0; j < harness_num_dims; j++) {
|
||||
if (jcp.output_dims[j] != 1 && offsets[j] != 0) {
|
||||
h->mov(reg_tmp_64, offsets[j]);
|
||||
h->imul(reg_tmp_64, h->ptr[reg_indexes + j * sizeof(size_t)]);
|
||||
h->add(pointer, reg_tmp_64);
|
||||
}
|
||||
}
|
||||
};
|
||||
for (auto i = 0; i < num_params; i++) {
|
||||
regs[i] = Reg64(reg64_tmp_start + i);
|
||||
if (i < num_inputs)
|
||||
h->mov(regs[i], h->ptr[reg_const_params + GET_OFF(src_ptrs) + i * sizeof(void*)]);
|
||||
else
|
||||
h->mov(regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]);
|
||||
init_ptrs_with_offsets(regs[i], &jcp.data_offsets[i * harness_num_dims]);
|
||||
}
|
||||
|
||||
for (auto& c : code) {
|
||||
c.first->emit_code(c.second.first, c.second.second, pool, gpr);
|
||||
}
|
||||
|
||||
h->postamble();
|
||||
}
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override;
|
||||
void init_data_pointers(size_t, size_t, const Reg64&, const Reg64&, const std::vector<Reg64>&) const;
|
||||
|
||||
jit_snippets_compile_args jcp;
|
||||
std::vector<std::pair<std::shared_ptr<Emitter>, ngraph::snippets::RegInfo>> code;
|
||||
std::vector<size_t> gp_regs_pool;
|
||||
std::vector<size_t> gp_regs_used;
|
||||
std::vector<size_t> vec_regs_pool;
|
||||
};
|
||||
///
|
||||
/// \brief TileSchedulerEmitter contains Tiles to be executed (presently vector and scalar). It calculates data offsets
|
||||
/// and work amounts, performs data pointer decrements if necessary. It also performs some Tile optimizations: scalar/vector
|
||||
/// tiles are emitted only if necessary; Tile body could be emitted directly, if only one Tile evaluation is required.
|
||||
///
|
||||
/// \param in[0] The number of the node inputs
|
||||
/// \param in[1] The number of the node outputs
|
||||
/// \param in[2] The number of elements that fits into vector register
|
||||
///
|
||||
|
||||
class TileSchedulerEmitter : public jit_container_emitter {
|
||||
public:
|
||||
TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
|
||||
const std::shared_ptr<ov::Node>& n);
|
||||
|
||||
size_t get_inputs_num() const override {return 0;}
|
||||
void emit_code(const std::vector<size_t> &in,
|
||||
const std::vector<size_t> &out,
|
||||
const std::vector<size_t> &pool,
|
||||
const std::vector<size_t> &gpr) const override;
|
||||
|
||||
private:
|
||||
void validate_arguments(const std::vector<size_t> &in,
|
||||
const std::vector<size_t> &out,
|
||||
const std::vector<size_t> &pool,
|
||||
const std::vector<size_t> &gpr) const override;
|
||||
void emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override;
|
||||
|
||||
void emit_tiles(const Reg64&, const std::vector<Reg64>&, size_t, const std::vector<size_t>& , const std::vector<size_t>&) const;
|
||||
|
||||
jit_snippets_compile_args jcp;
|
||||
};
|
||||
|
||||
///
|
||||
/// \brief Tile is designed to organize loop over the input and output data. It is essentially a for(...) loop:
|
||||
/// it calculates the total number of iterations, performs operations specified by enclosed emitters, advances iteration counters
|
||||
/// it performs operations specified by enclosed emitters, advances iteration counters
|
||||
/// and breaks when necessary.
|
||||
///
|
||||
/// \param in[0] The number of input entities (or scheduler counts) processed during one iteration of the tile.
|
||||
/// It is expected to be 1 for outer or scalar tiles and vlen for vector tiles.
|
||||
/// \param in[1] Increment of the previous Tile in current dimension. Must be 0 if this is the first Tile.
|
||||
/// So previous_inc is zero for outer and vector tiles (the are the first in dim) and vlen for scalar tiles (they usually go after vector Tiles).
|
||||
/// \param in[2] sum number inputs and number of outputs of the node.
|
||||
/// \param in[3] dimension of the tile. Note that only 2d Tile are currently supported, so dim is 0 for outer tiles, 1 for inner tiles.
|
||||
///
|
||||
// Todo: Inner and outer tiles have different semantics. For example, outer tile always has the increment == 1, and it can contain only
|
||||
// tile emitters (one outer or two inner). So it seems better to create different classes for inner and outer tiles.
|
||||
// Todo: Currently data pointers incremented after each read/write in Load/Store emitters, so we have to decrement them here
|
||||
// if the same data needs to be read twice. Better to move all the pointer increments to TileEmitter and avoid the increments if necessary.
|
||||
class TileEmitter : public jit_emitter {
|
||||
/// It is expected to be 1 for outer or scalar tiles and vlen for vector tiles.
|
||||
class TileEmitter : public jit_container_emitter {
|
||||
public:
|
||||
TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
|
||||
const std::shared_ptr<ov::Node>& n)
|
||||
: jit_emitter(h, isa, n) {
|
||||
const auto tile = ov::as_type_ptr<ngraph::snippets::op::Tile>(n);
|
||||
if (!tile)
|
||||
IE_THROW() << "TileEmitter invoked with invalid op argument";
|
||||
if (!tile->compile_params)
|
||||
IE_THROW() << "TileEmitter invoked without compile_params";
|
||||
code = tile->region;
|
||||
jcp = *reinterpret_cast<const jit_snippets_compile_args*>(tile->compile_params);
|
||||
}
|
||||
TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
|
||||
|
||||
size_t get_inputs_num() const override {return 0;}
|
||||
std::vector<AllocatedEmitter>& get_nested_code();
|
||||
void emit_code(const std::vector<size_t> &in,
|
||||
const std::vector<size_t> &out,
|
||||
const std::vector<size_t> &pool,
|
||||
const std::vector<size_t> &gpr) const override;
|
||||
|
||||
void emit_code(const std::vector<size_t> &in, const std::vector<size_t> &out,
|
||||
const std::vector<size_t> &pool = {}, const std::vector<size_t> &gpr = {}) const override {
|
||||
validate_arguments(in, out, pool, gpr);
|
||||
emit_impl(in, out, pool, gpr, nullptr);
|
||||
}
|
||||
void emit_body(const std::vector<size_t>& vec_pool, const std::vector<size_t>& gpr_pool) const;
|
||||
void emit_ptr_increments(const std::vector<Reg64>& data_ptr_regs) const;
|
||||
|
||||
private:
|
||||
void validate_arguments(const std::vector<size_t> &in, const std::vector<size_t> &out,
|
||||
const std::vector<size_t> &pool = {}, const std::vector<size_t> &gpr = {}) const override {
|
||||
if (in.size() != 4)
|
||||
IE_THROW() << "TileEmitter got invalid number of inputs. Expected 4, got " << in.size();
|
||||
if (out.size() != 0)
|
||||
IE_THROW() << "TileEmitter got unexpected output arguments.";
|
||||
const size_t num_params = in[2];
|
||||
if (num_params > SNIPPETS_MAX_SNIPPETS_DIMS)
|
||||
IE_THROW() << "TileEmitter supports only up to " << SNIPPETS_MAX_SNIPPETS_DIMS <<
|
||||
" parameters, got " << num_params;
|
||||
const size_t dim = in[3];
|
||||
if (dim >= SNIPPETS_MAX_TILE_RANK)
|
||||
IE_THROW() << "TileEmitter supports tile ranks up to " << SNIPPETS_MAX_TILE_RANK <<
|
||||
" got " << dim;
|
||||
}
|
||||
|
||||
void validate_arguments(const std::vector<size_t> &in,
|
||||
const std::vector<size_t> &out,
|
||||
const std::vector<size_t> &pool,
|
||||
const std::vector<size_t> &gpr) const override;
|
||||
void emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override {
|
||||
const size_t inc = in[0];
|
||||
const size_t previous_inc = in[1]; // increment of a previous tile in the same dim (0 if the first tile in the dim)
|
||||
const size_t num_params = in[2];
|
||||
const size_t dim = in[3]; // tile dimension: 0 - outer, 1 - inner
|
||||
const int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1
|
||||
Reg64 amount = Reg64(reg64_tmp_start + num_params); // amount
|
||||
std::array<Label, 2> for_body;
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override;
|
||||
|
||||
// If R15 is not used, reserve it for use in scalar to avoid redundant push-pop's.
|
||||
// todo: Do we need explicitly check that code contains ScalarEmitter?
|
||||
std::vector<size_t> local_gpr = reg64_tmp_start + num_params < 15 ? std::vector<size_t>{15} : std::vector<size_t>{};
|
||||
std::vector<Reg64> regs(num_params);
|
||||
for (auto i = 0; dim == 0 && i < num_params; i++)
|
||||
regs[i] = Reg64(reg64_tmp_start + i);
|
||||
// Loop processing could be simplified in some cases
|
||||
if (inc > jcp.scheduler_dims[dim]) {
|
||||
return;
|
||||
} else if (inc == jcp.scheduler_dims[dim]) {
|
||||
for (auto& c : code) {
|
||||
c.first->emit_code(c.second.first, c.second.second, pool, local_gpr);
|
||||
}
|
||||
} else {
|
||||
// The previous tile has done nothing, all the work is ours
|
||||
if (previous_inc == 0 || previous_inc > jcp.scheduler_dims[dim]) {
|
||||
h->mov(amount, jcp.scheduler_dims[dim]);
|
||||
// The previous tile has done all the work
|
||||
} else if (jcp.scheduler_dims[dim] % previous_inc == 0) {
|
||||
return;
|
||||
}// else: the previous tile has already set a proper work amount
|
||||
h->cmp(amount, inc);
|
||||
h->jl(for_body[0], CodeGenerator::T_NEAR);
|
||||
|
||||
h->L(for_body[1]);
|
||||
{
|
||||
h->push(amount);
|
||||
for (auto& c : code) {
|
||||
c.first->emit_code(c.second.first, c.second.second, pool, local_gpr);
|
||||
}
|
||||
h->pop(amount);
|
||||
// Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers
|
||||
// after reading/writing. This might be a problem if we need to read the same data multiple times (broadcasting shapes).
|
||||
// To overcome this limitation, we add appropriate negative offsets if necessary.
|
||||
for (auto i = 0; dim == 0 && i < num_params; i++) {
|
||||
if (jcp.scheduler_offsets[i] != 0) {
|
||||
h->add(regs[i], jcp.scheduler_offsets[i]);
|
||||
}
|
||||
}
|
||||
h->sub(amount, inc);
|
||||
h->cmp(amount, inc);
|
||||
h->jge(for_body[1], CodeGenerator::T_NEAR);
|
||||
}
|
||||
|
||||
h->L(for_body[0]);
|
||||
}
|
||||
}
|
||||
|
||||
// A = <42, 17>
|
||||
// B = < 1, 17>
|
||||
// for (auto k = 0; k < dom_0; k++) { // 42
|
||||
// for (auto n = 0; n < dom_1; n++) { // 17
|
||||
// auto a = *ptr0; ptr0 += vlan; // vector/scalar load
|
||||
// auto b = *ptr1; ptr1 += vlan; // vector/scalar load
|
||||
// }
|
||||
// ptr0 -= 0*dom_1;
|
||||
// ptr1 -= 1*dom_1;
|
||||
// }
|
||||
|
||||
// broadcast by MVD is extra case
|
||||
// A = <42, 17>
|
||||
// B = <42, 1>
|
||||
// for (auto k = 0; k < dom_0; k++) { // 42
|
||||
// for (auto n = 0; n < dom_1; n++) { // 17
|
||||
// auto a = *ptr0; ptr0 += vlan; // vector/scalar load
|
||||
// auto b = *ptr1; // broadcast load
|
||||
// }
|
||||
// ptr0 -= 0*dom_1;
|
||||
// ptr1 += sizeof(ptr1[0]); //ptr1 -= -sizeof(ptr1[0]);
|
||||
// }
|
||||
|
||||
// A = <42, 17, 31>
|
||||
// B = < 1, 17, 31>
|
||||
// for (auto k = 0; k < dom_0; k++) { // 42
|
||||
// for (auto n = 0; n < dom_1; n++) { // 17
|
||||
// for (auto m = 0; m < dom_2; m++) { // 31
|
||||
// auto a = *ptr0; ptr0 += vlan; // vector/scalar load
|
||||
// auto b = *ptr1; ptr1 += vlan; // vector/scalar load
|
||||
// }
|
||||
// }
|
||||
// ptr0 -= 0*dom_1*dom2;
|
||||
// ptr1 -= 1*dom_1*dom2;
|
||||
// }
|
||||
jit_snippets_compile_args jcp;
|
||||
std::vector<std::pair<std::shared_ptr<Emitter>, ngraph::snippets::RegInfo>> code;
|
||||
size_t num_inputs = 0;
|
||||
size_t num_outputs = 0;
|
||||
std::vector<size_t> io_dims {};
|
||||
std::vector<size_t> io_data_size {};
|
||||
size_t increment = 0;
|
||||
};
|
||||
|
||||
class NopEmitter : public jit_emitter {
|
||||
@ -311,17 +188,10 @@ private:
|
||||
}
|
||||
};
|
||||
|
||||
class FakeBroadcastEmitter : public jit_emitter {
|
||||
class BroadcastMoveEmitter : public jit_emitter {
|
||||
public:
|
||||
FakeBroadcastEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
|
||||
: jit_emitter(h, isa, n) {
|
||||
if (n->get_input_shape(0).empty())
|
||||
use_broadcast = true;
|
||||
else if (*n->get_input_shape(0).rbegin() != *n->get_output_shape(0).rbegin())
|
||||
use_broadcast = true;
|
||||
else
|
||||
use_broadcast = false;
|
||||
}
|
||||
BroadcastMoveEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
|
||||
|
||||
size_t get_inputs_num() const override {return 1;}
|
||||
|
||||
private:
|
||||
@ -329,45 +199,19 @@ private:
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override {
|
||||
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
|
||||
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
|
||||
} else {
|
||||
IE_THROW() << host_isa_;
|
||||
assert(!"unsupported isa");
|
||||
}
|
||||
}
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override;
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
|
||||
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
|
||||
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
Vmm vmm_src0 = Vmm(in[0]);
|
||||
Vmm vmm_dst = Vmm(out[0]);
|
||||
|
||||
if (use_broadcast) {
|
||||
h->uni_vbroadcastss(vmm_dst, Xmm(in[0]));
|
||||
} else {
|
||||
h->uni_vmovups(vmm_dst, vmm_src0);
|
||||
}
|
||||
}
|
||||
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
|
||||
|
||||
private:
|
||||
bool use_broadcast;
|
||||
size_t byte_size = 0lu;
|
||||
};
|
||||
|
||||
class ScalarEmitter : public jit_emitter {
|
||||
public:
|
||||
ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
|
||||
: jit_emitter(h, isa, n) {
|
||||
value = dnnl::impl::cpu::x64::float2int(ov::as_type_ptr<ngraph::snippets::op::Scalar>(n)->cast_vector<float>()[0]);
|
||||
push_arg_entry_of("scalar", value, true);
|
||||
prepare_table();
|
||||
}
|
||||
ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
|
||||
|
||||
size_t get_inputs_num() const override {return 0;}
|
||||
|
||||
@ -379,26 +223,10 @@ private:
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override {
|
||||
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
|
||||
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
|
||||
} else {
|
||||
IE_THROW() << host_isa_;
|
||||
assert(!"unsupported isa");
|
||||
}
|
||||
}
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override;
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
|
||||
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
|
||||
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
Vmm vmm_dst = Vmm(out[0]);
|
||||
h->uni_vbroadcastss(vmm_dst, table_val("scalar"));
|
||||
}
|
||||
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
|
||||
|
||||
private:
|
||||
int32_t value;
|
||||
@ -415,33 +243,16 @@ private:
|
||||
/// Blocked parameter to tell if input is actually blocked. Broadcast means broadcast by W in other cases no need to substitute load.
|
||||
class MemoryEmitter : public jit_emitter {
|
||||
public:
|
||||
MemoryEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
|
||||
: jit_emitter(h, isa, n), ea(getEA(n)) {
|
||||
}
|
||||
|
||||
size_t get_inputs_num() const override {return 1;}
|
||||
MemoryEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
|
||||
|
||||
protected:
|
||||
static auto getEA(const std::shared_ptr<ov::Node>& n) -> size_t {
|
||||
auto& rt = n->get_rt_info();
|
||||
size_t ea = 0;
|
||||
auto it = rt.find("effectiveAddress");
|
||||
if (it != rt.end()) {
|
||||
ea = it->second.as<int64_t>();
|
||||
} else {
|
||||
throw ov::Exception("effective address for Load generation cannot be determined");
|
||||
}
|
||||
return ea;
|
||||
}
|
||||
|
||||
size_t ea;
|
||||
Precision src_prc;
|
||||
Precision dst_prc;
|
||||
};
|
||||
|
||||
class StoreEmitter : public MemoryEmitter {
|
||||
public:
|
||||
StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
|
||||
: MemoryEmitter(h, isa, n) {
|
||||
}
|
||||
StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
|
||||
|
||||
size_t get_inputs_num() const override {return 1;}
|
||||
|
||||
@ -450,72 +261,20 @@ private:
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override {
|
||||
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
|
||||
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
|
||||
} else {
|
||||
IE_THROW() << host_isa_;
|
||||
assert(!"unsupported isa");
|
||||
}
|
||||
}
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override;
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
|
||||
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
|
||||
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
Reg64 out_reg(ea);
|
||||
Vmm vmm_src0 = Vmm(in[0]);
|
||||
h->uni_vmovups(h->ptr[out_reg], vmm_src0);
|
||||
h->add(out_reg, dnnl::impl::cpu::x64::cpu_isa_traits<isa>::vlen);
|
||||
}
|
||||
};
|
||||
|
||||
class ScalarStoreEmitter : public MemoryEmitter {
|
||||
public:
|
||||
ScalarStoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
|
||||
: MemoryEmitter(h, isa, n) {
|
||||
}
|
||||
|
||||
size_t get_inputs_num() const override {return 1;}
|
||||
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
|
||||
void emit_data() const override;
|
||||
|
||||
private:
|
||||
void emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override {
|
||||
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
|
||||
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
|
||||
} else {
|
||||
IE_THROW() << host_isa_;
|
||||
assert(!"unsupported isa");
|
||||
}
|
||||
}
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
|
||||
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
|
||||
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
Reg64 out_reg(ea);
|
||||
Xmm vmm_src0 = Xmm(in[0]);
|
||||
h->uni_vmovss(h->ptr[out_reg], vmm_src0);
|
||||
h->add(out_reg, sizeof(float));
|
||||
}
|
||||
size_t count;
|
||||
std::unique_ptr<jit_store_emitter> store_emitter = nullptr;
|
||||
};
|
||||
|
||||
class LoadEmitter : public MemoryEmitter {
|
||||
public:
|
||||
LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
|
||||
: MemoryEmitter(h, isa, n), shouldPostIncrement(*n->get_input_shape(0).rbegin() != 1) {
|
||||
}
|
||||
LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
|
||||
|
||||
size_t get_inputs_num() const override {return 0;}
|
||||
|
||||
@ -524,41 +283,21 @@ private:
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override {
|
||||
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
|
||||
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
|
||||
} else {
|
||||
IE_THROW() << host_isa_;
|
||||
assert(!"unsupported isa");
|
||||
}
|
||||
}
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override;
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
|
||||
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
|
||||
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
Reg64 in_reg(ea);
|
||||
Vmm vmm_src0 = Vmm(out[0]);
|
||||
h->uni_vmovups(vmm_src0, h->ptr[in_reg]);
|
||||
|
||||
if (shouldPostIncrement) {
|
||||
h->add(in_reg, dnnl::impl::cpu::x64::cpu_isa_traits<isa>::vlen);
|
||||
}
|
||||
}
|
||||
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
|
||||
void emit_data() const override;
|
||||
|
||||
private:
|
||||
bool shouldPostIncrement;
|
||||
size_t count;
|
||||
std::unique_ptr<jit_load_emitter> load_emitter = nullptr;
|
||||
};
|
||||
|
||||
class BroadcastLoadEmitter : public MemoryEmitter {
|
||||
public:
|
||||
BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
|
||||
: MemoryEmitter(h, isa, n) {
|
||||
}
|
||||
BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
|
||||
|
||||
size_t get_inputs_num() const override {return 0;}
|
||||
|
||||
private:
|
||||
@ -566,73 +305,54 @@ private:
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override {
|
||||
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
|
||||
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
|
||||
} else {
|
||||
IE_THROW() << host_isa_;
|
||||
assert(!"unsupported isa");
|
||||
}
|
||||
}
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override;
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
|
||||
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
|
||||
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
Reg64 in_reg(ea);
|
||||
Vmm vmm_src0 = Vmm(out[0]);
|
||||
|
||||
// In doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`,
|
||||
// key point here is not to add post-increment, it might be fixed by some other approach in future
|
||||
h->uni_vbroadcastss(vmm_src0, h->ptr[in_reg]);
|
||||
}
|
||||
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
|
||||
};
|
||||
|
||||
class ScalarLoadEmitter : public MemoryEmitter {
|
||||
class LoadConvertEmitter : public MemoryEmitter {
|
||||
public:
|
||||
ScalarLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
|
||||
: MemoryEmitter(h, isa, n), shouldPostIncrement(*n->get_input_shape(0).rbegin() != 1) {
|
||||
}
|
||||
LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
|
||||
|
||||
size_t get_inputs_num() const override {return 0;}
|
||||
|
||||
private:
|
||||
void emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override {
|
||||
if (host_isa_ == dnnl::impl::cpu::x64::sse41) {
|
||||
emit_isa<dnnl::impl::cpu::x64::sse41>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx2) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx2>(in, out);
|
||||
} else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) {
|
||||
emit_isa<dnnl::impl::cpu::x64::avx512_core>(in, out);
|
||||
} else {
|
||||
IE_THROW() << host_isa_;
|
||||
assert(!"unsupported isa");
|
||||
}
|
||||
}
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override;
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
|
||||
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
|
||||
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
|
||||
Reg64 in_reg(ea);
|
||||
Xmm vmm_src0 = Xmm(out[0]);
|
||||
h->uni_vmovss(vmm_src0, h->ptr[in_reg]);
|
||||
|
||||
// Doesn't work if the same pointer comes with multiple load operations
|
||||
if (shouldPostIncrement) {
|
||||
h->add(in_reg, sizeof(float));
|
||||
}
|
||||
}
|
||||
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
|
||||
void emit_data() const override;
|
||||
|
||||
private:
|
||||
bool shouldPostIncrement;
|
||||
size_t count;
|
||||
std::unique_ptr<jit_load_emitter> load_emitter = nullptr;
|
||||
};
|
||||
|
||||
class StoreConvertEmitter : public MemoryEmitter {
|
||||
public:
|
||||
StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n);
|
||||
|
||||
size_t get_inputs_num() const override {return 1;}
|
||||
|
||||
private:
|
||||
void emit_impl(const std::vector<size_t>& in,
|
||||
const std::vector<size_t>& out,
|
||||
const std::vector<size_t>& pool,
|
||||
const std::vector<size_t>& gpr,
|
||||
const ov::intel_cpu::emitter_context *emit_context) const override;
|
||||
|
||||
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
|
||||
void emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const;
|
||||
void emit_data() const override;
|
||||
|
||||
private:
|
||||
size_t count;
|
||||
std::unique_ptr<jit_store_emitter> store_emitter = nullptr;
|
||||
};
|
||||
|
||||
} // namespace intel_cpu
|
||||
|
@ -7,6 +7,8 @@
|
||||
#include "ngraph_transformations/op/leaky_relu.hpp"
|
||||
#include "ngraph_transformations/op/power_static.hpp"
|
||||
#include "ngraph_transformations/op/swish_cpu.hpp"
|
||||
#include "snippets_transformations/op/load_convert.hpp"
|
||||
#include "snippets_transformations/op/store_convert.hpp"
|
||||
|
||||
#include <ngraph/ngraph.hpp>
|
||||
#include <ngraph_ops/augru_cell.hpp>
|
||||
@ -42,6 +44,10 @@ std::map<std::string, ngraph::OpSet> Extension::getOpSets() {
|
||||
NGRAPH_OP(LeakyReluNode, ov::intel_cpu)
|
||||
NGRAPH_OP(PowerStaticNode, ov::intel_cpu)
|
||||
NGRAPH_OP(SwishNode, ov::intel_cpu)
|
||||
NGRAPH_OP(LoadConvertSaturation, ov::intel_cpu)
|
||||
NGRAPH_OP(LoadConvertTruncation, ov::intel_cpu)
|
||||
NGRAPH_OP(StoreConvertSaturation, ov::intel_cpu)
|
||||
NGRAPH_OP(StoreConvertTruncation, ov::intel_cpu)
|
||||
#undef NGRAPH_OP
|
||||
|
||||
return opset;
|
||||
|
@ -180,6 +180,39 @@ bool isSuitableMatMulParent(const std::shared_ptr<const Node> &node) {
|
||||
const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1);
|
||||
return is_suitable_node && has_only_child;
|
||||
}
|
||||
// Subtract as ZeroPoints for Convolution
|
||||
bool isSuitableSubtractAsZeroPointsParent(const std::shared_ptr<const Node> &node) {
|
||||
const bool is_suitable_node = ov::is_type<ngraph::op::v1::Subtract>(node);
|
||||
const auto out = node->outputs();
|
||||
const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1);
|
||||
const bool has_two_parents = node->get_input_size() == 2;
|
||||
if (!(is_suitable_node && has_only_child && has_two_parents))
|
||||
return false;
|
||||
|
||||
const auto child = node->get_output_target_inputs(0).begin()->get_node()->shared_from_this();
|
||||
const bool is_conv = ov::is_type<ov::op::v1::Convolution>(child);
|
||||
const bool is_group_conv = ov::is_type<ov::op::v1::GroupConvolution>(child);
|
||||
if (!is_conv && !is_group_conv)
|
||||
return false;
|
||||
const auto weight_shape = child->get_input_shape(1);
|
||||
const bool is_depthwise = is_group_conv && weight_shape[1] == 1 && weight_shape[2] == 1;
|
||||
const bool deptwise_is_suitable = implication(is_depthwise, child->get_input_shape(0).size() < 5);
|
||||
if (!(is_conv && deptwise_is_suitable))
|
||||
return false;
|
||||
|
||||
const bool first_input_is_suitable = node->get_input_node_shared_ptr(0)->get_output_element_type(0) == ov::element::u8;
|
||||
const auto zp_weights = node->get_input_node_shared_ptr(1);
|
||||
const auto zp_weight_shape = zp_weights->get_output_shape(0);
|
||||
bool second_input_is_suitable =
|
||||
ov::is_type<ngraph::op::v0::Constant>(zp_weights) &&
|
||||
zp_weights->get_output_element_type(0) == ov::element::u8 &&
|
||||
zp_weight_shape.size() >= 2;
|
||||
if (!(first_input_is_suitable && second_input_is_suitable))
|
||||
return false;
|
||||
auto correct_shape = ov::Shape(zp_weight_shape.size(), 1);
|
||||
correct_shape[1] = zp_weight_shape[1];
|
||||
return correct_shape == zp_weight_shape;
|
||||
}
|
||||
bool isSuitablePoolChild(const std::shared_ptr<const Node> &node) {
|
||||
const bool is_suitable_node = ov::is_type<ngraph::op::v1::MaxPool>(node);
|
||||
// has a single output, connected to a single child
|
||||
@ -225,15 +258,40 @@ bool isSuitableChildForFusingMatMul(const std::shared_ptr<const Node> &node, Nod
|
||||
// FuseMatMulAndSimpleOperation or FuseFullyConnectedAndSimpleOperation
|
||||
// Invoke SupportsFusingWithConvolution_Simple directly instead of isSuitableChildForFusingSimple to
|
||||
// eliminate getNumNonConstInputs() check
|
||||
int fusingAxis;
|
||||
if (can_be_converted_to_FC)
|
||||
fusingAxis = matmul_shape.size() == 3 ? 2 : 1;
|
||||
else
|
||||
fusingAxis = matmul_shape.size() - 1;
|
||||
int fusingAxis = can_be_converted_to_FC ? (matmul_shape.size() == 3 ? 2 : 1) : matmul_shape.size() - 1;
|
||||
|
||||
if (SupportsFusingWithConvolution_Simple(node, fusingAxis)) {
|
||||
updatedChainType = NodeFusingType::FusedWithMisc;
|
||||
return true;
|
||||
}
|
||||
|
||||
// canFuse() from MatMul for case with rank > 2
|
||||
// Algorithm::EltwisePowerStatic is ignored
|
||||
if (!can_be_converted_to_FC &&
|
||||
node->get_output_shape(0).size() > 2) {
|
||||
if (ov::is_type<ov::op::v1::Add>(node) ||
|
||||
ov::is_type<ov::op::v1::Multiply>(node) ||
|
||||
ov::is_type<ov::op::v1::Subtract>(node) ||
|
||||
ov::is_type<ov::op::v1::Divide>(node) ||
|
||||
ov::is_type<ov::op::v0::PRelu>(node)) {
|
||||
const auto const1 = ov::is_type<ov::op::v0::Constant>(node->get_input_node_shared_ptr(0));
|
||||
const auto const2 = ov::is_type<ov::op::v0::Constant>(node->get_input_node_shared_ptr(1));
|
||||
int constPort = -1;
|
||||
if (const2) {
|
||||
constPort = 1;
|
||||
} else if (const1) {
|
||||
constPort = 0;
|
||||
}
|
||||
|
||||
if (constPort != -1) {
|
||||
auto const_shape = node->get_input_shape(constPort);
|
||||
if (ov::shape_size(const_shape) != 1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FullyConnectedBiasFusion
|
||||
if (!(can_be_converted_to_FC && ov::is_type<ngraph::opset1::Add>(node) &&
|
||||
bias_shape.back() == matmul_shape.back() &&
|
||||
@ -340,6 +398,9 @@ bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr<ov::Model> &m) {
|
||||
} else if (isSuitableMatMulParent(node)) {
|
||||
SetNodeFusingType(node, NodeFusingType::FusedWithMatMul);
|
||||
continue;
|
||||
} else if (isSuitableSubtractAsZeroPointsParent(node)) {
|
||||
SetSnippetsNodeType(node, snippets::pass::SnippetsNodeType::SkippedByPlugin);
|
||||
continue;
|
||||
}
|
||||
for (const auto fusingChainType : getContinuableChains(node)) {
|
||||
if (isSuitableChildForFusingSimple(node, channelAxis)) {
|
||||
|
@ -22,6 +22,7 @@
|
||||
|
||||
#include <snippets/op/subgraph.hpp>
|
||||
#include "emitters/cpu_generator.hpp"
|
||||
#include "snippets_transformations/fuse_load_store_and_convert.hpp"
|
||||
|
||||
using namespace InferenceEngine;
|
||||
using namespace dnnl::impl::utils;
|
||||
@ -60,7 +61,7 @@ void Snippet::initSupportedPrimitiveDescriptors() {
|
||||
if (!supportedPrimitiveDescriptors.empty())
|
||||
return;
|
||||
|
||||
const Precision supportedPrecision = Precision::FP32;
|
||||
const std::set<Precision> supportedPrecisions = { Precision::FP32, Precision::I32, Precision::BF16, Precision::I8, Precision::U8 };
|
||||
|
||||
bool dimRanksAreEqual = true;
|
||||
for (size_t i = 0; dimRanksAreEqual && i < inputShapes.size(); i++) {
|
||||
@ -125,18 +126,29 @@ void Snippet::initSupportedPrimitiveDescriptors() {
|
||||
config.dynBatchSupport = false;
|
||||
config.inConfs.resize(inputShapes.size());
|
||||
for (size_t i = 0; i < inputShapes.size(); i++) {
|
||||
auto precision = getOriginalInputPrecisionAtPort(i);
|
||||
if (supportedPrecisions.count(precision) == 0)
|
||||
IE_THROW() << "Subgraph node with name `" << getName() << "` doesn't support " << precision << " precision.";
|
||||
|
||||
const auto equalPrecisions = getOriginalOutputPrecisions().size() == 1 &&
|
||||
precision == getOriginalOutputPrecisionAtPort(0);
|
||||
|
||||
BlockedMemoryDesc::CmpMask inputMask = BLOCKED_DESC_SKIP_OFFSET_MASK;
|
||||
PortConfig portConfig;
|
||||
portConfig.inPlace((!i && canBeInPlace()) ? 0 : -1);
|
||||
portConfig.inPlace((!i && canBeInPlace() && equalPrecisions) ? 0 : -1);
|
||||
portConfig.constant(false);
|
||||
if (inputShapes[i].getDims()[0] == 1) {
|
||||
inputMask.reset(0); // accepts any stride on batch axis
|
||||
}
|
||||
portConfig.setMemDesc(createMemoryDesc(inputShapes[i], supportedPrecision, offset), inputMask);
|
||||
portConfig.setMemDesc(createMemoryDesc(inputShapes[i], precision, offset), inputMask);
|
||||
config.inConfs[i] = portConfig;
|
||||
}
|
||||
config.outConfs.resize(outputShapes.size());
|
||||
for (size_t i = 0; i < outputShapes.size(); i++) {
|
||||
auto precision = getOriginalOutputPrecisionAtPort(i);
|
||||
if (supportedPrecisions.count(precision) == 0)
|
||||
IE_THROW() << "Subgraph node with name `" << getName() << "` doesn't support " << precision << " precision.";
|
||||
|
||||
BlockedMemoryDesc::CmpMask outputMask = BLOCKED_DESC_SKIP_OFFSET_MASK;
|
||||
PortConfig portConfig;
|
||||
portConfig.inPlace(-1);
|
||||
@ -144,7 +156,7 @@ void Snippet::initSupportedPrimitiveDescriptors() {
|
||||
if (outputShapes[i].getDims()[0] == 1) {
|
||||
outputMask.reset(0); // accepts any stride on batch axis
|
||||
}
|
||||
portConfig.setMemDesc(createMemoryDesc(outputShapes[i], supportedPrecision, offset), outputMask);
|
||||
portConfig.setMemDesc(createMemoryDesc(outputShapes[i], precision, offset), outputMask);
|
||||
config.outConfs[i] = portConfig;
|
||||
}
|
||||
|
||||
@ -203,11 +215,27 @@ bool Snippet::created() const {
|
||||
return getType() == Type::Subgraph;
|
||||
}
|
||||
|
||||
InferenceEngine::Precision Snippet::getRuntimePrecision() const {
|
||||
std::vector<InferenceEngine::Precision> inputPrecisions;
|
||||
for (size_t i = 0; i < getParentEdges().size(); i++) {
|
||||
auto parentEdge = getParentEdgeAt(i);
|
||||
if (parentEdge && parentEdge->getStatus() == Edge::Status::Validated && !parentEdge->getParent()->isConstant()) {
|
||||
inputPrecisions.emplace_back(DnnlExtensionUtils::DataTypeToIEPrecision((parentEdge->getMemoryPtr()->GetDataType())));
|
||||
}
|
||||
}
|
||||
|
||||
return getMaxPrecision(inputPrecisions);
|
||||
}
|
||||
|
||||
bool Snippet::canBeInPlace() const {
|
||||
if (getParentEdgesAtPort(0)[0]->getParent()->getType() == Type::Input) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (getChildEdges().size() != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (auto& parentEdge : getParentEdges()) {
|
||||
auto parent = parentEdge.lock()->getParent();
|
||||
if (parent->getChildEdges().size() != 1)
|
||||
@ -271,7 +299,9 @@ void Snippet::define_schedule() {
|
||||
ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes;
|
||||
for (size_t i = 0; i < outputShapes.size(); i++)
|
||||
output_blocked_shapes.push_back(edgeToBlockedShape(getChildEdgesAtPort(i)[0]));
|
||||
|
||||
exec_domain = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes);
|
||||
|
||||
// initialize by maximum output dimension. Dimensions of outputs should be broadcastable
|
||||
tensorRank = std::max(static_cast<size_t>(rank6D), exec_domain.size());
|
||||
// Canonicalization broadcasts inputs and outputs to max input rank, which can be smaller than tensorRank
|
||||
@ -287,8 +317,7 @@ void Snippet::define_schedule() {
|
||||
}
|
||||
|
||||
const auto config = getSelectedPrimitiveDescriptor()->getConfig();
|
||||
const auto dataSize = config.inConfs[0].getMemDesc()->getPrecision().size();
|
||||
auto initOffsets = [this, config, dataSize]() {
|
||||
auto initOffsets = [this, config]() {
|
||||
// find max rank input among all outputs
|
||||
const size_t inputNum = getParentEdges().size();
|
||||
offsets_in.resize(inputNum);
|
||||
@ -296,7 +325,7 @@ void Snippet::define_schedule() {
|
||||
offsets_in[i].resize(tensorRank, 1);
|
||||
offset_calculation(offsets_in[i], dims_in[i], exec_domain);
|
||||
for (size_t j = 0; j < tensorRank; j++) {
|
||||
offsets_in[i][j] *= dataSize;
|
||||
offsets_in[i][j] *= config.inConfs[i].getMemDesc()->getPrecision().size();
|
||||
}
|
||||
}
|
||||
|
||||
@ -305,7 +334,8 @@ void Snippet::define_schedule() {
|
||||
for (size_t i = 0; i < inputNum; i++) {
|
||||
const auto memPtr = getParentEdgeAt(i)->getMemoryPtr();
|
||||
srcMemPtrs[i] = memPtr;
|
||||
start_offset_in[i] = memPtr->GetDescWithType<BlockedMemoryDesc>()->getOffsetPadding() * dataSize;
|
||||
start_offset_in[i] = memPtr->GetDescWithType<BlockedMemoryDesc>()->getOffsetPadding() *
|
||||
config.inConfs[i].getMemDesc()->getPrecision().size();
|
||||
}
|
||||
|
||||
const size_t outputNum = config.outConfs.size();
|
||||
@ -314,7 +344,7 @@ void Snippet::define_schedule() {
|
||||
offsets_out[i].resize(tensorRank, 1);
|
||||
offset_calculation(offsets_out[i], dims_out[i], exec_domain);
|
||||
for (size_t j = 0; j < tensorRank; j++) {
|
||||
offsets_out[i][j] *= dataSize;
|
||||
offsets_out[i][j] *= config.outConfs[i].getMemDesc()->getPrecision().size();
|
||||
}
|
||||
}
|
||||
|
||||
@ -323,7 +353,8 @@ void Snippet::define_schedule() {
|
||||
for (size_t i = 0; i < outputNum; i++) {
|
||||
const auto memPtr = getChildEdgeAt(i)->getMemoryPtr();
|
||||
dstMemPtrs[i] = memPtr;
|
||||
start_offset_out[i] = memPtr->GetDescWithType<BlockedMemoryDesc>()->getOffsetPadding() * dataSize;
|
||||
start_offset_out[i] = memPtr->GetDescWithType<BlockedMemoryDesc>()->getOffsetPadding() *
|
||||
config.outConfs[i].getMemDesc()->getPrecision().size();
|
||||
}
|
||||
};
|
||||
|
||||
@ -373,7 +404,7 @@ void Snippet::define_schedule() {
|
||||
return collapsedDims;
|
||||
};
|
||||
|
||||
auto initSchedulingInfo = [this, dataSize]() -> void {
|
||||
auto initSchedulingInfo = [this, config]() -> void {
|
||||
// initialize scheduling information
|
||||
sch_offsets_in.resize(offsets_in.size(), 0);
|
||||
sch_offsets_out.resize(offsets_out.size(), 0);
|
||||
@ -385,19 +416,38 @@ void Snippet::define_schedule() {
|
||||
schedulerWorkAmount /= exec_domain[tensorRank - 2];
|
||||
exec_domain[tensorRank - 2] = 1;
|
||||
|
||||
// update offsets for tile 2D because loaders have ptr shifts in some cases and stores have always ptrs shifts
|
||||
// update offsets for tile 2D because loaders and stores have ptr shifts in some cases
|
||||
const int64_t vector_size = snippet->get_generator()->get_target_machine()->get_lanes();
|
||||
for (size_t i = 0; i < offsets_in.size(); i++) {
|
||||
int64_t offset = offsets_in[i][tensorRank - 2];
|
||||
if ((offset > dataSize) || (offset == 0 && dims_in[i].back() != 1)) {
|
||||
sch_offsets_in[i] = offset - exec_domain.back() * dataSize;
|
||||
} else if (offset == dataSize) {
|
||||
const int64_t offset = offsets_in[i][tensorRank - 2];
|
||||
const int64_t data_size = config.inConfs[i].getMemDesc()->getPrecision().size();
|
||||
if (offset == data_size || offset == vector_size * data_size) {
|
||||
sch_offsets_in[i] = offset;
|
||||
} else if ((offset > data_size) || (offset == 0 && dims_in[i].back() != 1 && dims_in[i].back() != vector_size)) {
|
||||
sch_offsets_in[i] = offset - exec_domain.back() * data_size;
|
||||
|
||||
// If scalar tile executes one time, ptr doesn't move on 1 value
|
||||
// so we should absolutelly decrease offset
|
||||
if (exec_domain.back() % vector_size == 1) {
|
||||
sch_offsets_in[i] += data_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < offsets_out.size(); i++) {
|
||||
int64_t offset = offsets_out[i][tensorRank - 2];
|
||||
sch_offsets_out[i] = offset - exec_domain.back() * dataSize;
|
||||
const int64_t offset = offsets_out[i][tensorRank - 2];
|
||||
const size_t data_size = config.outConfs[i].getMemDesc()->getPrecision().size();
|
||||
if (offset == data_size || offset == vector_size * data_size) {
|
||||
sch_offsets_out[i] = offset;
|
||||
} else if ((offset > data_size) || (offset == 0 && dims_out[i].back() != 1 && dims_out[i].back() != vector_size)) {
|
||||
sch_offsets_out[i] = offset - exec_domain.back() * data_size;
|
||||
|
||||
// If scalar tile executes one time, ptr doesn't move on 1 value
|
||||
// so we should absolutelly decrease offset
|
||||
if (exec_domain.back() % vector_size == 1) {
|
||||
sch_offsets_out[i] += data_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
@ -434,7 +484,28 @@ void Snippet::generate() {
|
||||
auto b = offsets_out[i].begin();
|
||||
std::copy(b, b + harness_num_dims, &jcp.data_offsets[(inputShapes.size() + i) * harness_num_dims]);
|
||||
}
|
||||
schedule = snippet->generate(reinterpret_cast<void*>(&jcp));
|
||||
|
||||
ov::pass::Manager optManager;
|
||||
optManager.register_pass<ov::intel_cpu::pass::FuseLoadConvert>();
|
||||
optManager.register_pass<ov::intel_cpu::pass::FuseStoreConvert>();
|
||||
|
||||
// LoadConvert uses Load emitter that support conversion from any type to only f32
|
||||
optManager.get_pass_config()->set_callback<ov::intel_cpu::pass::FuseLoadConvert>(
|
||||
[](const std::shared_ptr<const ov::Node>& n) -> bool {
|
||||
if (const auto& convert = std::dynamic_pointer_cast<const ov::op::v0::Convert>(n))
|
||||
return convert->get_destination_type() != ov::element::f32;
|
||||
return true;
|
||||
});
|
||||
|
||||
// StoreConvert uses Store emitter that support conversion from only f32 to any types
|
||||
optManager.get_pass_config()->set_callback<ov::intel_cpu::pass::FuseStoreConvert>(
|
||||
[](const std::shared_ptr<const ov::Node>& n) -> bool {
|
||||
if (const auto& convert = std::dynamic_pointer_cast<const ov::op::v0::Convert>(n))
|
||||
return convert->get_input_element_type(0) != ov::element::f32;
|
||||
return true;
|
||||
});
|
||||
|
||||
schedule = snippet->generate(optManager, reinterpret_cast<void*>(&jcp));
|
||||
}
|
||||
|
||||
void Snippet::schedule_6d(const jit_snippets_call_args& call_args) const {
|
||||
|
@ -30,6 +30,7 @@ public:
|
||||
void getSupportedDescriptors() override {};
|
||||
void initSupportedPrimitiveDescriptors() override;
|
||||
void selectOptimalPrimitiveDescriptor() override;
|
||||
InferenceEngine::Precision getRuntimePrecision() const override;
|
||||
|
||||
// Here we convert to canonical for & jit everything
|
||||
void createPrimitive() override;
|
||||
|
@ -0,0 +1,117 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/itt.hpp"
|
||||
|
||||
#include "fuse_load_store_and_convert.hpp"
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
|
||||
#include "snippets_transformations/op/load_convert.hpp"
|
||||
#include "snippets_transformations/op/store_convert.hpp"
|
||||
|
||||
#include "ngraph/opsets/opset1.hpp"
|
||||
#include "ngraph/rt_info.hpp"
|
||||
#include "ngraph/pattern/op/wrap_type.hpp"
|
||||
|
||||
ov::intel_cpu::pass::FuseLoadConvert::FuseLoadConvert() {
|
||||
MATCHER_SCOPE(FuseLoadConvert);
|
||||
auto param_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Parameter>();
|
||||
auto load_pattern = ngraph::pattern::wrap_type<ngraph::snippets::op::Load>({param_pattern});
|
||||
auto convert_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Convert>({load_pattern});
|
||||
|
||||
auto callback = [=](ngraph::pattern::Matcher& m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::FuseLoadConvert")
|
||||
auto& pm = m.get_pattern_value_map();
|
||||
const auto param = pm.at(param_pattern).get_node_shared_ptr();
|
||||
const auto load_shared = pm.at(load_pattern).get_node_shared_ptr();
|
||||
if (!load_shared || load_shared->output(0).get_target_inputs().size() != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto load = std::dynamic_pointer_cast<ngraph::snippets::op::Load>(load_shared);
|
||||
if (!load)
|
||||
return false;
|
||||
|
||||
const auto convert = pm.at(convert_pattern).get_node_shared_ptr();
|
||||
if (transformation_callback(convert))
|
||||
return false;
|
||||
|
||||
std::shared_ptr<ngraph::Node> load_convert = nullptr;
|
||||
if (const auto convert_saturation =
|
||||
std::dynamic_pointer_cast<ngraph::snippets::op::ConvertSaturation>(convert)) {
|
||||
load_convert = std::make_shared<ov::intel_cpu::LoadConvertSaturation>(param,
|
||||
convert_saturation->get_destination_type(),
|
||||
load->get_count());
|
||||
} else if (const auto convert_truncation =
|
||||
std::dynamic_pointer_cast<ngraph::snippets::op::ConvertTruncation>(convert)) {
|
||||
load_convert = std::make_shared<ov::intel_cpu::LoadConvertTruncation>(param,
|
||||
convert_truncation->get_destination_type(),
|
||||
load->get_count());
|
||||
} else {
|
||||
throw ngraph::ngraph_error(
|
||||
"Type of Convert op is undefined. Supports only fusing Load and ConvertTruncation or ConvertSaturation ops");
|
||||
}
|
||||
|
||||
if (!load_convert)
|
||||
return false;
|
||||
|
||||
ngraph::copy_runtime_info(convert, load_convert);
|
||||
ngraph::replace_node(convert, load_convert);
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
auto m = std::make_shared<ngraph::pattern::Matcher>(convert_pattern, matcher_name);
|
||||
register_matcher(m, callback);
|
||||
}
|
||||
|
||||
|
||||
ov::intel_cpu::pass::FuseStoreConvert::FuseStoreConvert() {
|
||||
MATCHER_SCOPE(FuseStoreConvert);
|
||||
auto input_pattern = ngraph::pattern::any_input();
|
||||
auto convert_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Convert>({input_pattern});
|
||||
auto store_pattern = ngraph::pattern::wrap_type<ngraph::snippets::op::Store>({convert_pattern});
|
||||
|
||||
auto callback = [=](ngraph::pattern::Matcher& m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::FuseStoreConvert")
|
||||
auto& pm = m.get_pattern_value_map();
|
||||
const auto input = pm.at(input_pattern).get_node_shared_ptr();
|
||||
|
||||
const auto store = std::dynamic_pointer_cast<ngraph::snippets::op::Store>(pm.at(store_pattern).get_node_shared_ptr());
|
||||
if (!store)
|
||||
return false;
|
||||
|
||||
const auto convert = pm.at(convert_pattern).get_node_shared_ptr();
|
||||
if (convert->output(0).get_target_inputs().size() != 1 || transformation_callback(convert))
|
||||
return false;
|
||||
|
||||
std::shared_ptr<ngraph::Node> store_convert = nullptr;
|
||||
if (const auto convert_saturation =
|
||||
std::dynamic_pointer_cast<ngraph::snippets::op::ConvertSaturation>(convert)) {
|
||||
store_convert = std::make_shared<ov::intel_cpu::StoreConvertSaturation>(input,
|
||||
convert_saturation->get_destination_type(),
|
||||
store->get_count());
|
||||
} else if (const auto convert_truncation =
|
||||
std::dynamic_pointer_cast<ngraph::snippets::op::ConvertTruncation>(convert)) {
|
||||
store_convert = std::make_shared<ov::intel_cpu::StoreConvertTruncation>(input,
|
||||
convert_truncation->get_destination_type(),
|
||||
store->get_count());
|
||||
} else {
|
||||
throw ngraph::ngraph_error(
|
||||
"Type of Convert op is undefined. Supports only fusing Store and ConvertTruncation or ConvertSaturation ops");
|
||||
}
|
||||
|
||||
|
||||
if (!store_convert)
|
||||
return false;
|
||||
|
||||
ngraph::copy_runtime_info(store, store_convert);
|
||||
ngraph::replace_node(store, store_convert);
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
auto m = std::make_shared<ngraph::pattern::Matcher>(store_pattern, matcher_name);
|
||||
register_matcher(m, callback);
|
||||
}
|
@ -0,0 +1,40 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/pass/graph_rewrite.hpp"
|
||||
#include "ngraph/pattern/matcher.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface FuseLoadConvert
|
||||
* @brief Fuse Load and ConvertSaturation into one op LoadConvertSaturation
|
||||
* Fuse Load and ConvertTruncation into one op LoadConvertTruncation
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class FuseLoadConvert: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
OPENVINO_RTTI("FuseLoadConvert", "0");
|
||||
FuseLoadConvert();
|
||||
};
|
||||
|
||||
/**
|
||||
* @interface FuseStoreConvert
|
||||
* @brief Fuse Store and ConvertSaturation into one op StoreConvertSaturation
|
||||
* Fuse Store and ConvertTruncation into one op StoreConvertTruncation
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class FuseStoreConvert: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
OPENVINO_RTTI("FuseStoreConvert", "0");
|
||||
FuseStoreConvert();
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
@ -0,0 +1,56 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/itt.hpp"
|
||||
|
||||
#include "load_convert.hpp"
|
||||
|
||||
#include "ngraph/runtime/host_tensor.hpp"
|
||||
|
||||
using namespace std;
|
||||
using namespace ov;
|
||||
|
||||
intel_cpu::LoadConvertSaturation::LoadConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count) :
|
||||
Load(x, count), m_destination_type(destination_type) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
bool intel_cpu::LoadConvertSaturation::visit_attributes(AttributeVisitor& visitor) {
|
||||
INTERNAL_OP_SCOPE(LoadConvert_visit_attributes);
|
||||
visitor.on_attribute("destination_type", m_destination_type);
|
||||
return true;
|
||||
}
|
||||
|
||||
void intel_cpu::LoadConvertSaturation::validate_and_infer_types() {
|
||||
INTERNAL_OP_SCOPE(LoadConvert_validate_and_infer_types);
|
||||
set_output_type(0, m_destination_type, get_input_partial_shape(0));
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> intel_cpu::LoadConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<LoadConvertSaturation>(new_args.at(0), m_destination_type, m_count);
|
||||
}
|
||||
|
||||
intel_cpu::LoadConvertTruncation::LoadConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count) :
|
||||
Load(x, count), m_destination_type(destination_type) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
bool intel_cpu::LoadConvertTruncation::visit_attributes(AttributeVisitor& visitor) {
|
||||
INTERNAL_OP_SCOPE(LoadConvert_visit_attributes);
|
||||
visitor.on_attribute("destination_type", m_destination_type);
|
||||
return true;
|
||||
}
|
||||
|
||||
void intel_cpu::LoadConvertTruncation::validate_and_infer_types() {
|
||||
INTERNAL_OP_SCOPE(LoadConvert_validate_and_infer_types);
|
||||
set_output_type(0, m_destination_type, get_input_partial_shape(0));
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> intel_cpu::LoadConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<LoadConvertTruncation>(new_args.at(0), m_destination_type, m_count);
|
||||
}
|
@ -0,0 +1,68 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/op/op.hpp"
|
||||
#include "snippets/op/load.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
/**
|
||||
* @interface LoadConvertSaturation
|
||||
* @brief Fused operation to represent computations equal to consecutive Load and ConvertSaturation operations.
|
||||
* The operation is used for peephole optimization during subgraph lowering.
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class LoadConvertSaturation : public ngraph::snippets::op::Load {
|
||||
public:
|
||||
OPENVINO_OP("LoadConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Load);
|
||||
|
||||
LoadConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count = 1lu);
|
||||
LoadConvertSaturation() = default;
|
||||
|
||||
ov::element::Type get_destination_type() const { return m_destination_type; }
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
|
||||
void validate_and_infer_types() override;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
|
||||
bool has_evaluate() const override { return false; }
|
||||
|
||||
protected:
|
||||
ov::element::Type m_destination_type;
|
||||
};
|
||||
|
||||
/**
|
||||
* @interface LoadConvertTruncation
|
||||
* @brief Fused operation to represent computations equal to consecutive Load and ConvertTruncation operations.
|
||||
* The operation is used for peephole optimization during subgraph lowering.
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class LoadConvertTruncation : public ngraph::snippets::op::Load {
|
||||
public:
|
||||
OPENVINO_OP("LoadConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Load);
|
||||
|
||||
LoadConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count = 1lu);
|
||||
LoadConvertTruncation() = default;
|
||||
|
||||
ov::element::Type get_destination_type() const { return m_destination_type; }
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
|
||||
void validate_and_infer_types() override;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
|
||||
bool has_evaluate() const override { return false; }
|
||||
|
||||
protected:
|
||||
ov::element::Type m_destination_type;
|
||||
};
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
@ -0,0 +1,56 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/itt.hpp"
|
||||
|
||||
#include "store_convert.hpp"
|
||||
|
||||
#include "ngraph/runtime/host_tensor.hpp"
|
||||
|
||||
using namespace std;
|
||||
using namespace ov;
|
||||
|
||||
intel_cpu::StoreConvertSaturation::StoreConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count) :
|
||||
Store(x, count), m_destination_type(destination_type) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
bool intel_cpu::StoreConvertSaturation::visit_attributes(AttributeVisitor& visitor) {
|
||||
INTERNAL_OP_SCOPE(StoreConvert_visit_attributes);
|
||||
visitor.on_attribute("destination_type", m_destination_type);
|
||||
return true;
|
||||
}
|
||||
|
||||
void intel_cpu::StoreConvertSaturation::validate_and_infer_types() {
|
||||
INTERNAL_OP_SCOPE(StoreConvert_validate_and_infer_types);
|
||||
set_output_type(0, m_destination_type, get_input_partial_shape(0));
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> intel_cpu::StoreConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<StoreConvertSaturation>(new_args.at(0), m_destination_type, m_count);
|
||||
}
|
||||
|
||||
intel_cpu::StoreConvertTruncation::StoreConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count) :
|
||||
Store(x, count), m_destination_type(destination_type) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
bool intel_cpu::StoreConvertTruncation::visit_attributes(AttributeVisitor& visitor) {
|
||||
INTERNAL_OP_SCOPE(StoreConvert_visit_attributes);
|
||||
visitor.on_attribute("destination_type", m_destination_type);
|
||||
return true;
|
||||
}
|
||||
|
||||
void intel_cpu::StoreConvertTruncation::validate_and_infer_types() {
|
||||
INTERNAL_OP_SCOPE(StoreConvert_validate_and_infer_types);
|
||||
set_output_type(0, m_destination_type, get_input_partial_shape(0));
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> intel_cpu::StoreConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<StoreConvertTruncation>(new_args.at(0), m_destination_type, m_count);
|
||||
}
|
@ -0,0 +1,68 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/op/op.hpp"
|
||||
#include "snippets/op/store.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
/**
|
||||
* @interface StoreConvertSaturation
|
||||
* @brief Fused operation to represent computations equal to consecutive Store and ConvertSaturation operations.
|
||||
* The operation is used for peephole optimization during subgraph lowering.
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class StoreConvertSaturation : public ngraph::snippets::op::Store {
|
||||
public:
|
||||
OPENVINO_OP("StoreConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Store);
|
||||
|
||||
StoreConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count = 1lu);
|
||||
StoreConvertSaturation() = default;
|
||||
|
||||
ov::element::Type get_destination_type() const { return m_destination_type; }
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
|
||||
void validate_and_infer_types() override;
|
||||
|
||||
bool has_evaluate() const override { return false; }
|
||||
|
||||
protected:
|
||||
ov::element::Type m_destination_type;
|
||||
};
|
||||
|
||||
/**
|
||||
* @interface StoreConvertTruncation
|
||||
* @brief Fused operation to represent computations equal to consecutive Store and ConvertTruncation operations.
|
||||
* The operation is used for peephole optimization during subgraph lowering.
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class StoreConvertTruncation : public ngraph::snippets::op::Store {
|
||||
public:
|
||||
OPENVINO_OP("StoreConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Store);
|
||||
|
||||
StoreConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type, const size_t count = 1lu);
|
||||
StoreConvertTruncation() = default;
|
||||
|
||||
ov::element::Type get_destination_type() const { return m_destination_type; }
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
|
||||
void validate_and_infer_types() override;
|
||||
|
||||
bool has_evaluate() const override { return false; }
|
||||
|
||||
protected:
|
||||
ov::element::Type m_destination_type;
|
||||
};
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
@ -12,23 +12,31 @@ namespace snippets {
|
||||
|
||||
namespace {
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add,
|
||||
::testing::Combine(
|
||||
::testing::Values(ov::Shape {1, 42, 16, 64}),
|
||||
::testing::Values(ov::Shape {1, 42, 16, 1}),
|
||||
::testing::Values(1), // one node - Add
|
||||
::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
Add::getTestCaseName);
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add,
|
||||
::testing::Combine(
|
||||
::testing::Values(ov::Shape {1, 42, 16, 64}),
|
||||
::testing::Values(ov::Shape {1, 42, 16, 1}),
|
||||
::testing::Values(1), // one node - Add
|
||||
::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
Add::getTestCaseName);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinh,
|
||||
::testing::Combine(
|
||||
::testing::Values(ov::Shape {1, 42, 16, 64}),
|
||||
::testing::Values(ov::Shape {1, 42, 16, 1}),
|
||||
::testing::Values(3), // Add + 2 converts after inputs
|
||||
::testing::Values(1), // Subgraph is created, since the inputs are followed by converts
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
AddSinh::getTestCaseName);
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinh,
|
||||
::testing::Combine(
|
||||
::testing::Values(ov::Shape {1, 42, 16, 64}),
|
||||
::testing::Values(ov::Shape {1, 42, 16, 1}),
|
||||
::testing::Values(3), // Add + 2 converts after inputs
|
||||
::testing::Values(1), // Subgraph is created, since the inputs are followed by converts
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
AddSinh::getTestCaseName);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinhConst,
|
||||
::testing::Combine(
|
||||
::testing::Values(ov::Shape {1, 42, 16, 64}),
|
||||
::testing::Values(2), // Add + 2 converts after inputs
|
||||
::testing::Values(1), // Subgraph is created, since the inputs are followed by converts
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
AddSinhConst::getTestCaseName);
|
||||
|
||||
} // namespace
|
||||
} // namespace snippets
|
||||
|
@ -0,0 +1,162 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/convert.hpp"
|
||||
#include "common_test_utils/test_constants.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
|
||||
namespace {
|
||||
|
||||
const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_Convert = {
|
||||
{ { ov::element::f32 }, { ov::element::i32 } },
|
||||
{ { ov::element::f32 }, { ov::element::bf16 } },
|
||||
{ { ov::element::f32 }, { ov::element::u8 } },
|
||||
{ { ov::element::f32 }, { ov::element::i8 } },
|
||||
|
||||
{ { ov::element::bf16 }, { ov::element::f32 } },
|
||||
{ { ov::element::bf16 }, { ov::element::i32 } },
|
||||
{ { ov::element::bf16 }, { ov::element::i8 } },
|
||||
{ { ov::element::bf16 }, { ov::element::u8 } },
|
||||
|
||||
{ { ov::element::i8 }, { ov::element::f32 } },
|
||||
{ { ov::element::i8 }, { ov::element::i32 } },
|
||||
{ { ov::element::i8 }, { ov::element::bf16 } },
|
||||
{ { ov::element::i8 }, { ov::element::u8 } },
|
||||
|
||||
{ { ov::element::u8 }, { ov::element::f32 } },
|
||||
{ { ov::element::u8 }, { ov::element::i32 } },
|
||||
{ { ov::element::u8 }, { ov::element::bf16 } },
|
||||
{ { ov::element::u8 }, { ov::element::i8 } },
|
||||
};
|
||||
|
||||
const std::vector<std::vector<ov::Shape>> inputShapes_Convert = {
|
||||
{ ov::Shape{2, 16} },
|
||||
{ ov::Shape{5, 5} },
|
||||
{ ov::Shape{2, 12, 1} }
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Convert, Convert,
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inputShapes_Convert),
|
||||
::testing::ValuesIn(types_Convert),
|
||||
::testing::Values(2),
|
||||
::testing::Values(1),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
Convert::getTestCaseName);
|
||||
|
||||
const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertInput = {
|
||||
{ { ov::element::f32 }, { ov::element::i32 } },
|
||||
{ { ov::element::f32 }, { ov::element::bf16 } },
|
||||
|
||||
{ { ov::element::bf16 }, { ov::element::f32 } },
|
||||
|
||||
{ { ov::element::i8 }, { ov::element::f32 } },
|
||||
{ { ov::element::i8 }, { ov::element::i32 } },
|
||||
{ { ov::element::i8 }, { ov::element::bf16 } },
|
||||
|
||||
{ { ov::element::u8 }, { ov::element::f32 } },
|
||||
{ { ov::element::u8 }, { ov::element::i32 } },
|
||||
{ { ov::element::u8 }, { ov::element::bf16 } },
|
||||
};
|
||||
|
||||
const std::vector<std::vector<ov::Shape>> inputShapes_ConvertInput = {
|
||||
{ ov::Shape{2, 16}, ov::Shape{1, 16} },
|
||||
{ ov::Shape{5, 18}, ov::Shape{5, 1} },
|
||||
{ ov::Shape{3, 1}, ov::Shape{3, 21} }
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertInput, ConvertInput,
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inputShapes_ConvertInput),
|
||||
::testing::ValuesIn(types_ConvertInput),
|
||||
::testing::Values(3),
|
||||
::testing::Values(1),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
Convert::getTestCaseName);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertOutput, ConvertOutput,
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inputShapes_ConvertInput),
|
||||
::testing::ValuesIn(types_ConvertInput),
|
||||
::testing::Values(3),
|
||||
::testing::Values(1),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
Convert::getTestCaseName);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertStub, ConvertStub,
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inputShapes_ConvertInput),
|
||||
::testing::ValuesIn(types_ConvertInput),
|
||||
::testing::Values(4),
|
||||
::testing::Values(2),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
Convert::getTestCaseName);
|
||||
|
||||
const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertPartialInputsAndResults = {
|
||||
{ { ov::element::i8, ov::element::i32, ov::element::f32 }, { ov::element::f32, ov::element::i8 } },
|
||||
{ { ov::element::bf16, ov::element::u8, ov::element::i32 }, { ov::element::i32, ov::element::bf16 } },
|
||||
};
|
||||
|
||||
const std::vector<std::vector<ov::Shape>> inputShapes_ConvertPartialInputsAndResults = {
|
||||
{ ov::Shape{2, 16}, ov::Shape{1, 16}, ov::Shape{1, 1} },
|
||||
{ ov::Shape{5, 18}, ov::Shape{5, 1}, ov::Shape{1, 18} },
|
||||
{ ov::Shape{3, 1}, ov::Shape{3, 21}, ov::Shape{3, 1} }
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertPartialInputsAndResults, ConvertPartialInputsAndResults,
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inputShapes_ConvertPartialInputsAndResults),
|
||||
::testing::ValuesIn(types_ConvertPartialInputsAndResults),
|
||||
::testing::Values(6),
|
||||
::testing::Values(1),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
Convert::getTestCaseName);
|
||||
|
||||
const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertMany = {
|
||||
{ { ov::element::i32, ov::element::u8}, {} },
|
||||
{ { ov::element::i32, ov::element::u8, ov::element::i32 }, {} },
|
||||
{ { ov::element::i32, ov::element::f32, ov::element::i32, ov::element::i8 }, {} },
|
||||
{ { ov::element::i32, ov::element::i8, ov::element::i32, ov::element::f32 }, {} },
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnInputs, ConvertManyOnInputs,
|
||||
::testing::Combine(
|
||||
::testing::Values(std::vector<ov::Shape>{ov::Shape{5, 5, 5, 5}}),
|
||||
::testing::ValuesIn(types_ConvertMany),
|
||||
::testing::Values(2),
|
||||
::testing::Values(1),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
Convert::getTestCaseName);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnOutputs, ConvertManyOnOutputs,
|
||||
::testing::Combine(
|
||||
::testing::Values(std::vector<ov::Shape>{ov::Shape{5, 5, 5, 5}}),
|
||||
::testing::ValuesIn(types_ConvertMany),
|
||||
::testing::Values(5), // sinh + subgraph + reorders for sinh
|
||||
::testing::Values(1),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
Convert::getTestCaseName);
|
||||
|
||||
const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertManyIO = {
|
||||
{ { ov::element::i32, ov::element::u8}, {ov::element::i32} },
|
||||
{ { ov::element::i32, ov::element::u8, ov::element::i32 }, { ov::element::i32, ov::element::i8, ov::element::i32, ov::element::f32 } },
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnInputOutput, ConvertManyOnInputOutput,
|
||||
::testing::Combine(
|
||||
::testing::Values(std::vector<ov::Shape>{ov::Shape{5, 5, 5, 5}}),
|
||||
::testing::ValuesIn(types_ConvertManyIO),
|
||||
::testing::Values(5), // sinh + subgraph + reorders for sinh
|
||||
::testing::Values(1),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
Convert::getTestCaseName);
|
||||
|
||||
} // namespace
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -0,0 +1,25 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/eltwise_two_results.hpp"
|
||||
#include "common_test_utils/test_constants.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
namespace {
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, EltwiseTwoResults,
|
||||
::testing::Combine(
|
||||
::testing::Values(ov::Shape {1, 64, 10, 10}),
|
||||
::testing::Values(ov::Shape {1, 64, 10, 1}),
|
||||
::testing::Values(4),
|
||||
::testing::Values(2),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
EltwiseTwoResults::getTestCaseName);
|
||||
|
||||
} // namespace
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -0,0 +1,26 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/max_num_params_eltwise.hpp"
|
||||
#include "common_test_utils/test_constants.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
namespace {
|
||||
// Note that we need these shapes to cover all cases of code emission (none/one/multiple of scalar/vector tiles)
|
||||
std::vector<ov::Shape> input_shapes {{1, 64, 10, 10}, {1, 1, 17, 37}, {1, 1, 1, 1}, {1, 1, 1, 7},
|
||||
{1, 1, 1, 128}, {1, 1, 1, 14}, {1, 1, 1, 16}, {1, 1, 1, 30}};
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, MaxNumParamsEltwiseSinh,
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(input_shapes),
|
||||
::testing::Values(12), // 10 Sinh after inputs + Subgraph + Concat
|
||||
::testing::Values(1),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
MaxNumParamsEltwiseSinh::getTestCaseName);
|
||||
|
||||
} // namespace
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -10,25 +10,25 @@ namespace test {
|
||||
namespace snippets {
|
||||
namespace {
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwise,
|
||||
::testing::Combine(
|
||||
::testing::Values(ov::Shape {1, 64, 10, 10}),
|
||||
::testing::Values(ov::Shape {1, 64, 10, 1}),
|
||||
::testing::Values(ov::Shape {1, 1, 1, 10}),
|
||||
::testing::Values(2), // eltwises fuse only for non-broadcasted shapes
|
||||
::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
ThreeInputsEltwise::getTestCaseName);
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwise,
|
||||
::testing::Combine(
|
||||
::testing::Values(ov::Shape {1, 64, 10, 10}),
|
||||
::testing::Values(ov::Shape {1, 64, 10, 1}),
|
||||
::testing::Values(ov::Shape {1, 1, 1, 10}),
|
||||
::testing::Values(2), // eltwises fuse only for non-broadcasted shapes
|
||||
::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
ThreeInputsEltwise::getTestCaseName);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwiseSinh,
|
||||
::testing::Combine(
|
||||
::testing::Values(ov::Shape {1, 64, 10, 10}),
|
||||
::testing::Values(ov::Shape {1, 64, 10, 1}),
|
||||
::testing::Values(ov::Shape {1, 1, 1, 10}),
|
||||
::testing::Values(4), // Subgraph + 3 converts after inputs
|
||||
::testing::Values(1), // Subgraph is created, since the inputs are followed by converts
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
ThreeInputsEltwiseSinh::getTestCaseName);
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwiseSinh,
|
||||
::testing::Combine(
|
||||
::testing::Values(ov::Shape {1, 64, 10, 10}),
|
||||
::testing::Values(ov::Shape {1, 64, 10, 1}),
|
||||
::testing::Values(ov::Shape {1, 1, 1, 10}),
|
||||
::testing::Values(4), // Subgraph + 3 converts after inputs
|
||||
::testing::Values(1), // Subgraph is created, since the inputs are followed by converts
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
ThreeInputsEltwiseSinh::getTestCaseName);
|
||||
|
||||
} // namespace
|
||||
} // namespace snippets
|
||||
|
@ -0,0 +1,45 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/two_inputs_and_outputs.hpp"
|
||||
#include "common_test_utils/test_constants.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
namespace {
|
||||
|
||||
const std::vector<std::vector<ov::Shape>> input_shapes = {
|
||||
{ {5, 5, 256, 1}, {5, 5, 256, 1} },
|
||||
{ {5, 5, 16, 35}, {5, 5, 16, 35} },
|
||||
{ {5, 5, 256, 1}, {5, 5, 256, 35} },
|
||||
{ {5, 5, 256, 1}, {5, 5, 1, 1} },
|
||||
|
||||
{ {5, 5, 16, 35}, {5, 5, 1, 1} },
|
||||
{ {5, 5, 16, 35}, {5, 5, 16, 1} },
|
||||
{ {5, 5, 5, 35}, {5, 5, 1, 35} },
|
||||
{ {5, 5, 16, 1}, {5, 5, 1, 35} },
|
||||
|
||||
{ {5, 5, 35, 16}, {5, 5, 35, 16} },
|
||||
{ {5, 5, 35, 16}, {5, 5, 1, 16} },
|
||||
|
||||
{ {5, 5, 35, 17}, {5, 5, 35, 17} },
|
||||
{ {5, 5, 35, 17}, {5, 5, 1, 17} },
|
||||
|
||||
{ {5, 5, 35, 18}, {5, 5, 35, 18} },
|
||||
{ {5, 5, 35, 18}, {5, 5, 1, 18} },
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, TwoInputsAndOutputs,
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(input_shapes),
|
||||
::testing::Values(4),
|
||||
::testing::Values(1),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
TwoInputsAndOutputs::getTestCaseName);
|
||||
|
||||
} // namespace
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -731,4 +731,4 @@ const auto params_5D_dyn_param = ::testing::Combine(
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_MemOrder_dyn_param, EltwiseLayerCPUTest, params_5D_dyn_param, EltwiseLayerCPUTest::getTestCaseName);
|
||||
|
||||
} // namespace
|
||||
} // namespace CPULayerTestsDefinitions
|
||||
} // namespace CPULayerTestsDefinitions
|
||||
|
2
src/plugins/intel_cpu/thirdparty/onednn
vendored
2
src/plugins/intel_cpu/thirdparty/onednn
vendored
@ -1 +1 @@
|
||||
Subproject commit 2a749c577f8a841a396d4bd46eaf311b7e7dc089
|
||||
Subproject commit f9e363fc1ff47191c7ddea63b19c7893965a786a
|
@ -18,6 +18,13 @@ typedef std::tuple<
|
||||
std::string // Target Device
|
||||
> AddParams;
|
||||
|
||||
typedef std::tuple<
|
||||
ov::Shape, // Input 0 Shape
|
||||
size_t, // Expected num nodes
|
||||
size_t, // Expected num subgraphs
|
||||
std::string // Target Device
|
||||
> AddConstParams;
|
||||
|
||||
class Add : public testing::WithParamInterface<ov::test::snippets::AddParams>,
|
||||
virtual public ov::test::SnippetsTestsCommon {
|
||||
public:
|
||||
@ -32,6 +39,14 @@ protected:
|
||||
void SetUp() override;
|
||||
};
|
||||
|
||||
class AddSinhConst : public testing::WithParamInterface<ov::test::snippets::AddConstParams>,
|
||||
virtual public ov::test::SnippetsTestsCommon {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<ov::test::snippets::AddConstParams> obj);
|
||||
protected:
|
||||
void SetUp() override;
|
||||
};
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -0,0 +1,76 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "shared_test_classes/base/snippets_test_utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
typedef std::tuple<
|
||||
std::vector<ov::Shape>, // InputShapes
|
||||
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>, // Input and Output data types for Converts
|
||||
size_t, // Expected num nodes
|
||||
size_t, // Expected num subgraphs
|
||||
std::string // Target Device
|
||||
> ConvertParams;
|
||||
|
||||
using parameters = std::vector<std::tuple<int32_t, int32_t, int32_t>>;
|
||||
|
||||
class Convert : public testing::WithParamInterface<ov::test::snippets::ConvertParams>,
|
||||
virtual public ov::test::SnippetsTestsCommon {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<ov::test::snippets::ConvertParams> obj);
|
||||
|
||||
protected:
|
||||
void SetUp() override;
|
||||
|
||||
void generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) override;
|
||||
virtual parameters generate_params_random() const;
|
||||
|
||||
ov::element::Type output_type = ov::element::f32;
|
||||
};
|
||||
|
||||
class ConvertInput : public Convert {
|
||||
protected:
|
||||
void SetUp() override;
|
||||
|
||||
parameters generate_params_random() const override;
|
||||
};
|
||||
|
||||
class ConvertOutput : public ConvertInput {
|
||||
protected:
|
||||
void SetUp() override;
|
||||
};
|
||||
|
||||
class ConvertStub : public ConvertInput {
|
||||
protected:
|
||||
void SetUp() override;
|
||||
};
|
||||
|
||||
class ConvertPartialInputsAndResults : public ConvertInput {
|
||||
protected:
|
||||
void SetUp() override;
|
||||
};
|
||||
|
||||
class ConvertManyOnInputs : public ConvertInput {
|
||||
protected:
|
||||
void SetUp() override;
|
||||
};
|
||||
|
||||
class ConvertManyOnOutputs : public ConvertInput {
|
||||
protected:
|
||||
void SetUp() override;
|
||||
};
|
||||
|
||||
class ConvertManyOnInputOutput : public ConvertInput {
|
||||
protected:
|
||||
void SetUp() override;
|
||||
};
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -0,0 +1,33 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "shared_test_classes/base/snippets_test_utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
typedef std::tuple<
|
||||
ov::Shape, // Input 0 Shape
|
||||
ov::Shape, // Input 1 Shape
|
||||
size_t, // Expected num nodes
|
||||
size_t, // Expected num subgraphs
|
||||
std::string // Target Device
|
||||
> EltwiseTwoResultsParams;
|
||||
|
||||
class EltwiseTwoResults : public testing::WithParamInterface<ov::test::snippets::EltwiseTwoResultsParams>,
|
||||
virtual public ov::test::SnippetsTestsCommon {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<ov::test::snippets::EltwiseTwoResultsParams> obj);
|
||||
|
||||
protected:
|
||||
void SetUp() override;
|
||||
};
|
||||
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -0,0 +1,31 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "shared_test_classes/base/snippets_test_utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
typedef std::tuple<
|
||||
ov::Shape, // Input Shape All shapes are replicated
|
||||
size_t, // Expected num nodes
|
||||
size_t, // Expected num subgraphs
|
||||
std::string // Target Device
|
||||
> MaxNumParamsEltwiseParams;
|
||||
|
||||
class MaxNumParamsEltwiseSinh : public testing::WithParamInterface<ov::test::snippets::MaxNumParamsEltwiseParams>,
|
||||
virtual public ov::test::SnippetsTestsCommon {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<ov::test::snippets::MaxNumParamsEltwiseParams> obj);
|
||||
|
||||
protected:
|
||||
void SetUp() override;
|
||||
};
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -0,0 +1,31 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "shared_test_classes/base/snippets_test_utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
typedef std::tuple<
|
||||
std::vector<ov::Shape>, // Input Shape All shapes
|
||||
size_t, // Expected num nodes
|
||||
size_t, // Expected num subgraphs
|
||||
std::string // Target Device
|
||||
> TwoInputsAndOutputsParams;
|
||||
|
||||
class TwoInputsAndOutputs : public testing::WithParamInterface<ov::test::snippets::TwoInputsAndOutputsParams>,
|
||||
virtual public ov::test::SnippetsTestsCommon {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<ov::test::snippets::TwoInputsAndOutputsParams> obj);
|
||||
|
||||
protected:
|
||||
void SetUp() override;
|
||||
};
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -10,38 +10,61 @@ namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
std::string Add::getTestCaseName(testing::TestParamInfo<ov::test::snippets::AddParams> obj) {
|
||||
ov::Shape inputShapes0, inputShapes1, newInputShapes;
|
||||
std::string targetDevice;
|
||||
size_t num_nodes, num_subgraphs;
|
||||
std::tie(inputShapes0, inputShapes1, num_nodes, num_subgraphs, targetDevice) = obj.param;
|
||||
std::string Add::getTestCaseName(testing::TestParamInfo<ov::test::snippets::AddParams> obj) {
|
||||
ov::Shape inputShapes0, inputShapes1, newInputShapes;
|
||||
std::string targetDevice;
|
||||
size_t num_nodes, num_subgraphs;
|
||||
std::tie(inputShapes0, inputShapes1, num_nodes, num_subgraphs, targetDevice) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_";
|
||||
result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_";
|
||||
result << "#N=" << num_nodes << "_";
|
||||
result << "#S=" << num_subgraphs << "_";
|
||||
result << "targetDevice=" << targetDevice;
|
||||
return result.str();
|
||||
}
|
||||
std::ostringstream result;
|
||||
result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_";
|
||||
result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_";
|
||||
result << "#N=" << num_nodes << "_";
|
||||
result << "#S=" << num_subgraphs << "_";
|
||||
result << "targetDevice=" << targetDevice;
|
||||
return result.str();
|
||||
}
|
||||
|
||||
void Add::SetUp() {
|
||||
ov::Shape inputShape0, inputShape1;
|
||||
std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}});
|
||||
void Add::SetUp() {
|
||||
ov::Shape inputShape0, inputShape1;
|
||||
std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}});
|
||||
|
||||
auto f = ov::test::snippets::AddFunction({inputShape0, inputShape1});
|
||||
function = f.getOriginal();
|
||||
}
|
||||
auto f = ov::test::snippets::AddFunction({inputShape0, inputShape1});
|
||||
function = f.getOriginal();
|
||||
}
|
||||
|
||||
void AddSinh::SetUp() {
|
||||
ov::Shape inputShape0, inputShape1;
|
||||
std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}});
|
||||
void AddSinh::SetUp() {
|
||||
ov::Shape inputShape0, inputShape1;
|
||||
std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}});
|
||||
|
||||
auto f = ov::test::snippets::AddSinhFunction({inputShape0, inputShape1});
|
||||
function = f.getOriginal();
|
||||
}
|
||||
auto f = ov::test::snippets::AddSinhFunction({inputShape0, inputShape1});
|
||||
function = f.getOriginal();
|
||||
}
|
||||
|
||||
std::string AddSinhConst::getTestCaseName(testing::TestParamInfo<ov::test::snippets::AddConstParams> obj) {
|
||||
ov::Shape inputShapes, newInputShapes;
|
||||
std::string targetDevice;
|
||||
size_t num_nodes, num_subgraphs;
|
||||
std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes) << "_";
|
||||
result << "#N=" << num_nodes << "_";
|
||||
result << "#S=" << num_subgraphs << "_";
|
||||
result << "targetDevice=" << targetDevice;
|
||||
return result.str();
|
||||
}
|
||||
|
||||
void AddSinhConst::SetUp() {
|
||||
ov::Shape inputShape;
|
||||
std::tie(inputShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes({{{}, {inputShape, }}});
|
||||
|
||||
auto f = ov::test::snippets::AddSinhConstFunction({inputShape});
|
||||
function = f.getOriginal();
|
||||
}
|
||||
|
||||
TEST_P(Add, CompareWithRefImpl) {
|
||||
run();
|
||||
@ -53,6 +76,11 @@ TEST_P(AddSinh, CompareWithRefImpl) {
|
||||
validateNumSubgraphs();
|
||||
}
|
||||
|
||||
TEST_P(AddSinhConst, CompareWithRefImpl) {
|
||||
run();
|
||||
validateNumSubgraphs();
|
||||
}
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
||||
|
231
src/tests/functional/plugin/shared/src/snippets/convert.cpp
Normal file
231
src/tests/functional/plugin/shared/src/snippets/convert.cpp
Normal file
@ -0,0 +1,231 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "common_test_utils/common_utils.hpp"
|
||||
#include "snippets/convert.hpp"
|
||||
#include "subgraph_converts.hpp"
|
||||
#include "common_test_utils/ov_tensor_utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
std::string Convert::getTestCaseName(testing::TestParamInfo<ov::test::snippets::ConvertParams> obj) {
|
||||
std::vector<ov::Shape> inputShape;
|
||||
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
|
||||
std::string targetDevice;
|
||||
size_t num_nodes, num_subgraphs;
|
||||
std::tie(inputShape, types, num_nodes, num_subgraphs, targetDevice) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
result << "IS=";
|
||||
for (const auto& sh : inputShape)
|
||||
result << CommonTestUtils::vec2str(sh) << "_";
|
||||
result << "IT=" << CommonTestUtils::vec2str(types.first) << "_";
|
||||
result << "OT=" << CommonTestUtils::vec2str(types.second) << "_";
|
||||
result << "#N=" << num_nodes << "_";
|
||||
result << "#S=" << num_subgraphs << "_";
|
||||
result << "targetDevice=" << targetDevice;
|
||||
return result.str();
|
||||
}
|
||||
|
||||
void Convert::SetUp() {
|
||||
std::vector<ov::Shape> inputShape;
|
||||
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
|
||||
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes(static_shapes_to_test_representation(inputShape));
|
||||
|
||||
auto f = ov::test::snippets::ConvertFunction(inputShape, types.first[0], types.second[0]);
|
||||
function = f.getOriginal();
|
||||
output_type = types.second.front();
|
||||
}
|
||||
|
||||
parameters Convert::generate_params_random() const {
|
||||
int32_t startFrom, range, resolution = 5;
|
||||
switch (output_type) {
|
||||
case ov::element::f32:
|
||||
case ov::element::i32:
|
||||
case ov::element::bf16:
|
||||
startFrom = -10;
|
||||
range = 20;
|
||||
break;
|
||||
case ov::element::u8:
|
||||
startFrom = -10;
|
||||
range = 20;
|
||||
break;
|
||||
case ov::element::i8:
|
||||
startFrom = 117;
|
||||
range = 20;
|
||||
break;
|
||||
default:
|
||||
startFrom = 0;
|
||||
range = 10;
|
||||
}
|
||||
return {{ startFrom, range, resolution }};
|
||||
}
|
||||
|
||||
void Convert::generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) {
|
||||
inputs.clear();
|
||||
const auto& funcInputs = function->inputs();
|
||||
const auto params = generate_params_random();
|
||||
if (params.size() != funcInputs.size()) {
|
||||
IE_THROW() << "Incorrect count of parameters for random generation and inputs of function!";
|
||||
}
|
||||
|
||||
for (int i = 0; i < funcInputs.size(); ++i) {
|
||||
const auto& funcInput = funcInputs[i];
|
||||
ov::Tensor tensor;
|
||||
int32_t startFrom, range, resolution;
|
||||
std::tie(startFrom, range, resolution) = params[i];
|
||||
tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i],
|
||||
range, startFrom, resolution);
|
||||
inputs.insert({funcInput.get_node_shared_ptr(), tensor});
|
||||
}
|
||||
}
|
||||
|
||||
void ConvertInput::SetUp() {
|
||||
std::vector<ov::Shape> inputShape;
|
||||
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
|
||||
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes(static_shapes_to_test_representation(inputShape));
|
||||
|
||||
auto f = ov::test::snippets::ConvertInputFunction(inputShape, types.first[0], types.second[0]);
|
||||
function = f.getOriginal();
|
||||
}
|
||||
|
||||
parameters ConvertInput::generate_params_random() const {
|
||||
parameters params;
|
||||
const auto& funcInputs = function->inputs();
|
||||
for (int i = 0; i < funcInputs.size(); ++i) {
|
||||
int32_t startFrom, range, resolution = 1;
|
||||
switch (funcInputs[i].get_element_type()) {
|
||||
case ov::element::f32:
|
||||
case ov::element::bf16:
|
||||
startFrom = -10;
|
||||
range = 20;
|
||||
resolution = 7;
|
||||
break;
|
||||
case ov::element::i32:
|
||||
case ov::element::i8:
|
||||
startFrom = -10;
|
||||
range = 20;
|
||||
break;
|
||||
case ov::element::u8:
|
||||
startFrom = 10;
|
||||
range = 20;
|
||||
break;
|
||||
default:
|
||||
startFrom = 0;
|
||||
range = 10;
|
||||
}
|
||||
params.push_back({ startFrom, range, resolution });
|
||||
}
|
||||
return params;
|
||||
}
|
||||
|
||||
void ConvertOutput::SetUp() {
|
||||
std::vector<ov::Shape> inputShape;
|
||||
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
|
||||
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes(static_shapes_to_test_representation(inputShape));
|
||||
|
||||
auto f = ov::test::snippets::ConvertOutputFunction(inputShape, types.first[0], types.second[0]);
|
||||
function = f.getOriginal();
|
||||
output_type = types.second.front();
|
||||
}
|
||||
|
||||
void ConvertStub::SetUp() {
|
||||
std::vector<ov::Shape> inputShape;
|
||||
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
|
||||
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes(static_shapes_to_test_representation(inputShape));
|
||||
|
||||
auto f = ov::test::snippets::ConvertStubFunction(inputShape, types.first[0], types.second[0]);
|
||||
function = f.getOriginal();
|
||||
output_type = types.second.front();
|
||||
}
|
||||
|
||||
void ConvertPartialInputsAndResults::SetUp() {
|
||||
std::vector<ov::Shape> inputShape;
|
||||
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
|
||||
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes(static_shapes_to_test_representation(inputShape));
|
||||
|
||||
auto f = ov::test::snippets::ConvertPartialInputsAndResultsFunction(inputShape, types.first, types.second);
|
||||
function = f.getOriginal();
|
||||
}
|
||||
|
||||
void ConvertManyOnInputs::SetUp() {
|
||||
std::vector<ov::Shape> inputShape;
|
||||
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
|
||||
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes(static_shapes_to_test_representation(inputShape));
|
||||
|
||||
auto f = ov::test::snippets::ConvertManyOnInputsFunction(inputShape, types.first);
|
||||
function = f.getOriginal();
|
||||
}
|
||||
|
||||
void ConvertManyOnOutputs::SetUp() {
|
||||
std::vector<ov::Shape> inputShape;
|
||||
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
|
||||
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes(static_shapes_to_test_representation(inputShape));
|
||||
|
||||
auto f = ov::test::snippets::ConvertManyOnOutputsFunction(inputShape, types.first);
|
||||
function = f.getOriginal();
|
||||
}
|
||||
|
||||
void ConvertManyOnInputOutput::SetUp() {
|
||||
std::vector<ov::Shape> inputShape;
|
||||
std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> types;
|
||||
std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes(static_shapes_to_test_representation(inputShape));
|
||||
|
||||
auto f = ov::test::snippets::ConvertManyOnInputOutputFunction(inputShape, types.first, types.second);
|
||||
function = f.getOriginal();
|
||||
}
|
||||
|
||||
TEST_P(Convert, CompareWithRefImpl) {
|
||||
run();
|
||||
validateNumSubgraphs();
|
||||
}
|
||||
|
||||
TEST_P(ConvertInput, CompareWithRefImpl) {
|
||||
run();
|
||||
validateNumSubgraphs();
|
||||
}
|
||||
|
||||
TEST_P(ConvertOutput, CompareWithRefImpl) {
|
||||
run();
|
||||
validateNumSubgraphs();
|
||||
}
|
||||
|
||||
TEST_P(ConvertStub, CompareWithRefImpl) {
|
||||
run();
|
||||
validateNumSubgraphs();
|
||||
}
|
||||
|
||||
TEST_P(ConvertPartialInputsAndResults, CompareWithRefImpl) {
|
||||
run();
|
||||
validateNumSubgraphs();
|
||||
}
|
||||
|
||||
TEST_P(ConvertManyOnInputs, CompareWithRefImpl) {
|
||||
run();
|
||||
validateNumSubgraphs();
|
||||
}
|
||||
|
||||
TEST_P(ConvertManyOnOutputs, CompareWithRefImpl) {
|
||||
run();
|
||||
validateNumSubgraphs();
|
||||
}
|
||||
|
||||
TEST_P(ConvertManyOnInputOutput, CompareWithRefImpl) {
|
||||
run();
|
||||
validateNumSubgraphs();
|
||||
}
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -0,0 +1,44 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "common_test_utils/common_utils.hpp"
|
||||
#include "snippets/eltwise_two_results.hpp"
|
||||
#include "subgraph_simple.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
std::string EltwiseTwoResults::getTestCaseName(testing::TestParamInfo<ov::test::snippets::EltwiseTwoResultsParams> obj) {
|
||||
ov::Shape inputShapes0, inputShapes1;
|
||||
std::string targetDevice;
|
||||
size_t num_nodes, num_subgraphs;
|
||||
std::tie(inputShapes0, inputShapes1, num_nodes, num_subgraphs, targetDevice) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_";
|
||||
result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_";
|
||||
result << "#N=" << num_nodes << "_";
|
||||
result << "#S=" << num_subgraphs << "_";
|
||||
result << "targetDevice=" << targetDevice;
|
||||
return result.str();
|
||||
}
|
||||
|
||||
void EltwiseTwoResults::SetUp() {
|
||||
ov::Shape inputShape0, inputShape1;
|
||||
std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}});
|
||||
|
||||
auto f = ov::test::snippets::EltwiseTwoResultsFunction({inputShape0, inputShape1});
|
||||
function = f.getOriginal();
|
||||
}
|
||||
|
||||
TEST_P(EltwiseTwoResults, CompareWithRefImpl) {
|
||||
run();
|
||||
validateNumSubgraphs();
|
||||
}
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -0,0 +1,49 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "common_test_utils/common_utils.hpp"
|
||||
#include "snippets/max_num_params_eltwise.hpp"
|
||||
#include "subgraph_simple.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
std::string MaxNumParamsEltwiseSinh::getTestCaseName(testing::TestParamInfo<ov::test::snippets::MaxNumParamsEltwiseParams> obj) {
|
||||
ov::Shape inputShapes;
|
||||
std::string targetDevice;
|
||||
size_t num_nodes, num_subgraphs;
|
||||
std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes) << "_";
|
||||
result << "#N=" << num_nodes << "_";
|
||||
result << "#S=" << num_subgraphs << "_";
|
||||
result << "targetDevice=" << targetDevice;
|
||||
return result.str();
|
||||
}
|
||||
|
||||
void MaxNumParamsEltwiseSinh::SetUp() {
|
||||
ov::Shape inputShape;
|
||||
std::tie(inputShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
std::vector<ov::Shape> expandedShapes(10, inputShape);
|
||||
std::vector<InputShape> input_shapes;
|
||||
for (const auto& s : expandedShapes) {
|
||||
input_shapes.emplace_back(InputShape {{}, {s, }});
|
||||
}
|
||||
|
||||
init_input_shapes(input_shapes);
|
||||
|
||||
auto f = ov::test::snippets::EltwiseMaxNumParamsSinhFunction(expandedShapes);
|
||||
function = f.getOriginal();
|
||||
}
|
||||
|
||||
TEST_P(MaxNumParamsEltwiseSinh, CompareWithRefImpl) {
|
||||
run();
|
||||
validateNumSubgraphs();
|
||||
}
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -10,42 +10,42 @@ namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
std::string ThreeInputsEltwise::getTestCaseName(testing::TestParamInfo<ov::test::snippets::ThreeInputsEltwiseParams> obj) {
|
||||
ov::Shape inputShapes0, inputShapes1, inputShapes2;
|
||||
std::string targetDevice;
|
||||
size_t num_nodes, num_subgraphs;
|
||||
std::tie(inputShapes0, inputShapes1, inputShapes2,
|
||||
num_nodes, num_subgraphs, targetDevice) = obj.param;
|
||||
std::string ThreeInputsEltwise::getTestCaseName(testing::TestParamInfo<ov::test::snippets::ThreeInputsEltwiseParams> obj) {
|
||||
ov::Shape inputShapes0, inputShapes1, inputShapes2;
|
||||
std::string targetDevice;
|
||||
size_t num_nodes, num_subgraphs;
|
||||
std::tie(inputShapes0, inputShapes1, inputShapes2,
|
||||
num_nodes, num_subgraphs, targetDevice) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_";
|
||||
result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_";
|
||||
result << "IS[2]=" << CommonTestUtils::vec2str(inputShapes2) << "_";
|
||||
result << "#N=" << num_nodes << "_";
|
||||
result << "#S=" << num_subgraphs << "_";
|
||||
result << "targetDevice=" << targetDevice;
|
||||
return result.str();
|
||||
}
|
||||
std::ostringstream result;
|
||||
result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_";
|
||||
result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_";
|
||||
result << "IS[2]=" << CommonTestUtils::vec2str(inputShapes2) << "_";
|
||||
result << "#N=" << num_nodes << "_";
|
||||
result << "#S=" << num_subgraphs << "_";
|
||||
result << "targetDevice=" << targetDevice;
|
||||
return result.str();
|
||||
}
|
||||
|
||||
void ThreeInputsEltwise::SetUp() {
|
||||
ov::Shape inputShape0, inputShape1, inputShape2;
|
||||
std::tie(inputShape0, inputShape1, inputShape2,
|
||||
ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, {inputShape2, }}});
|
||||
void ThreeInputsEltwise::SetUp() {
|
||||
ov::Shape inputShape0, inputShape1, inputShape2;
|
||||
std::tie(inputShape0, inputShape1, inputShape2,
|
||||
ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, {inputShape2, }}});
|
||||
|
||||
auto f = ov::test::snippets::EltwiseThreeInputsFunction({inputShape0, inputShape1, inputShape2});
|
||||
function = f.getOriginal();
|
||||
}
|
||||
auto f = ov::test::snippets::EltwiseThreeInputsFunction({inputShape0, inputShape1, inputShape2});
|
||||
function = f.getOriginal();
|
||||
}
|
||||
|
||||
void ThreeInputsEltwiseSinh::SetUp() {
|
||||
ov::Shape inputShape0, inputShape1, inputShape2;
|
||||
std::tie(inputShape0, inputShape1, inputShape2,
|
||||
ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, {inputShape2, }}});
|
||||
void ThreeInputsEltwiseSinh::SetUp() {
|
||||
ov::Shape inputShape0, inputShape1, inputShape2;
|
||||
std::tie(inputShape0, inputShape1, inputShape2,
|
||||
ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, {inputShape2, }}});
|
||||
|
||||
auto f = ov::test::snippets::EltwiseThreeInputsSinhFunction({inputShape0, inputShape1, inputShape2});
|
||||
function = f.getOriginal();
|
||||
}
|
||||
auto f = ov::test::snippets::EltwiseThreeInputsSinhFunction({inputShape0, inputShape1, inputShape2});
|
||||
function = f.getOriginal();
|
||||
}
|
||||
|
||||
TEST_P(ThreeInputsEltwise, CompareWithRefImpl) {
|
||||
run();
|
||||
|
@ -0,0 +1,43 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "common_test_utils/common_utils.hpp"
|
||||
#include "snippets/two_inputs_and_outputs.hpp"
|
||||
#include "subgraph_simple.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
std::string TwoInputsAndOutputs::getTestCaseName(testing::TestParamInfo<ov::test::snippets::TwoInputsAndOutputsParams> obj) {
|
||||
std::vector<ov::Shape> inputShapes;
|
||||
std::string targetDevice;
|
||||
size_t num_nodes, num_subgraphs;
|
||||
std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
for (auto i = 0; i < inputShapes.size(); i++)
|
||||
result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i]) << "_";
|
||||
result << "#N=" << num_nodes << "_";
|
||||
result << "#S=" << num_subgraphs << "_";
|
||||
result << "targetDevice=" << targetDevice;
|
||||
return result.str();
|
||||
}
|
||||
|
||||
void TwoInputsAndOutputs::SetUp() {
|
||||
std::vector<ov::Shape> inputShape;
|
||||
std::tie(inputShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
|
||||
init_input_shapes(static_shapes_to_test_representation(inputShape));
|
||||
auto f = ov::test::snippets::TwoInputsAndOutputsFunction(inputShape);
|
||||
function = f.getOriginal();
|
||||
}
|
||||
|
||||
TEST_P(TwoInputsAndOutputs, CompareWithRefImpl) {
|
||||
run();
|
||||
validateNumSubgraphs();
|
||||
}
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -3,11 +3,16 @@
|
||||
//
|
||||
|
||||
#include "shared_test_classes/base/snippets_test_utils.hpp"
|
||||
#include "functional_test_utils/skip_tests_config.hpp"
|
||||
#include "exec_graph_info.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
void SnippetsTestsCommon::validateNumSubgraphs() {
|
||||
bool isCurrentTestDisabled = FuncTestUtils::SkipTestsConfig::currentTestIsDisabled();
|
||||
if (isCurrentTestDisabled)
|
||||
GTEST_SKIP() << "Disabled test due to configuration" << std::endl;
|
||||
|
||||
const auto& compiled_model = compiledModel.get_runtime_model();
|
||||
size_t num_subgraphs = 0;
|
||||
size_t num_nodes = 0;
|
||||
|
@ -0,0 +1,214 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/ngraph.hpp"
|
||||
#include "./snippets_helpers.hpp"
|
||||
|
||||
/* This file contains definitions of relatively simple functions (models) that will be used
|
||||
* to test snippets-specific behavior. All the functions are expected to be direct descendants of
|
||||
* SnippetsFunctionBase, so their constructors take only one (inputShapes) argument.
|
||||
*/
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
/// The most trivial graph, just one Convert.
|
||||
/// Tokenized simply by starting subgraph.
|
||||
// in1
|
||||
// Convert
|
||||
// Result
|
||||
class ConvertFunction : public SnippetsFunctionBase {
|
||||
public:
|
||||
explicit ConvertFunction(const std::vector<Shape>& inputShapes,
|
||||
const ov::element::Type inType = ov::element::f32,
|
||||
const ov::element::Type outType = ov::element::u8)
|
||||
: SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) {
|
||||
NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");
|
||||
}
|
||||
protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
std::shared_ptr<ov::Model> initReference() const override;
|
||||
|
||||
ov::element::Type inType;
|
||||
ov::element::Type outType;
|
||||
};
|
||||
|
||||
|
||||
/// The one of the input of Add is Convert
|
||||
/// Tokenized simply by starting subgraph.
|
||||
// in1
|
||||
// Convert in2
|
||||
// Add
|
||||
// Result
|
||||
class ConvertInputFunction : public SnippetsFunctionBase {
|
||||
public:
|
||||
explicit ConvertInputFunction(const std::vector<Shape>& inputShapes,
|
||||
const ov::element::Type inType = ov::element::f32,
|
||||
const ov::element::Type outType = ov::element::u8)
|
||||
: SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) {
|
||||
NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");
|
||||
}
|
||||
protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
std::shared_ptr<ov::Model> initReference() const override;
|
||||
|
||||
ov::element::Type inType;
|
||||
ov::element::Type outType;
|
||||
};
|
||||
|
||||
/// The output of Sub is Convert
|
||||
/// Tokenized simply by starting subgraph.
|
||||
// in1 in2
|
||||
// Sub
|
||||
// Convert
|
||||
// Result
|
||||
class ConvertOutputFunction : public SnippetsFunctionBase {
|
||||
public:
|
||||
explicit ConvertOutputFunction(const std::vector<Shape>& inputShapes,
|
||||
const ov::element::Type inType = ov::element::f32,
|
||||
const ov::element::Type outType = ov::element::i8)
|
||||
: SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) {
|
||||
NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");
|
||||
}
|
||||
protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
std::shared_ptr<ov::Model> initReference() const override;
|
||||
|
||||
ov::element::Type inType;
|
||||
ov::element::Type outType;
|
||||
};
|
||||
|
||||
|
||||
/// There are 2 subgraphs: Add + Convert(Stub) and Relu
|
||||
/// Tokenized simply by starting subgraph.
|
||||
// in1 in2 in1 in2
|
||||
// Add Subgraph
|
||||
// Convert -> |
|
||||
// Relu Subgraph
|
||||
// Result Result
|
||||
class ConvertStubFunction : public SnippetsFunctionBase {
|
||||
public:
|
||||
explicit ConvertStubFunction(const std::vector<Shape>& inputShapes,
|
||||
const ov::element::Type inType = ov::element::f32,
|
||||
const ov::element::Type outType = ov::element::i8)
|
||||
: SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) {
|
||||
NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");
|
||||
}
|
||||
protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
std::shared_ptr<ov::Model> initReference() const override;
|
||||
|
||||
ov::element::Type inType;
|
||||
ov::element::Type outType;
|
||||
};
|
||||
|
||||
|
||||
/// Not all Inputs and Results have Convert
|
||||
/// Tokenized simply by starting subgraph.
|
||||
// in1 in2
|
||||
// Convert Convert
|
||||
// Add
|
||||
// Relu in3
|
||||
// Convert Sub
|
||||
// Result1 Unsqueeze <- It's to avoid many result output for subgraph (it's a limitation of collapsing)
|
||||
// Result2
|
||||
class ConvertPartialInputsAndResultsFunction : public SnippetsFunctionBase {
|
||||
public:
|
||||
explicit ConvertPartialInputsAndResultsFunction(const std::vector<Shape>& inputShapes,
|
||||
const std::vector<ov::element::Type>& inTypes = {ov::element::f32},
|
||||
const std::vector<ov::element::Type>& outTypes = {ov::element::f32})
|
||||
: SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) {
|
||||
NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes");
|
||||
}
|
||||
protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
std::shared_ptr<ov::Model> initReference() const override;
|
||||
|
||||
std::vector<ov::element::Type> inTypes;
|
||||
std::vector<ov::element::Type> outTypes;
|
||||
};
|
||||
|
||||
/// Convert Sequence on input
|
||||
/// Tokenized simply by starting subgraph.
|
||||
// in in
|
||||
// Stub Stub
|
||||
// Convert |
|
||||
// Convert -> Subgraph
|
||||
// Convert |
|
||||
// Relu Result
|
||||
// Result
|
||||
class ConvertManyOnInputsFunction : public SnippetsFunctionBase {
|
||||
public:
|
||||
explicit ConvertManyOnInputsFunction(const std::vector<Shape>& inputShapes, const std::vector<ov::element::Type>& types)
|
||||
: SnippetsFunctionBase(inputShapes), types(types) {
|
||||
NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");
|
||||
NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types");
|
||||
}
|
||||
protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
std::shared_ptr<ov::Model> initReference() const override;
|
||||
|
||||
std::vector<ov::element::Type> types;
|
||||
};
|
||||
|
||||
/// Convert Sequence on output
|
||||
/// Tokenized simply by starting subgraph.
|
||||
// in in
|
||||
// Stub Stub
|
||||
// Relu |
|
||||
// Convert -> Subgraph
|
||||
// Convert |
|
||||
// Convert |
|
||||
// Result Result
|
||||
class ConvertManyOnOutputsFunction : public SnippetsFunctionBase {
|
||||
public:
|
||||
explicit ConvertManyOnOutputsFunction(const std::vector<Shape>& inputShapes, const std::vector<ov::element::Type>& types)
|
||||
: SnippetsFunctionBase(inputShapes), types(types) {
|
||||
NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");
|
||||
NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types");
|
||||
}
|
||||
protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
std::shared_ptr<ov::Model> initReference() const override;
|
||||
|
||||
std::vector<ov::element::Type> types;
|
||||
};
|
||||
|
||||
/// Convert Sequence on input and output
|
||||
/// Tokenized simply by starting subgraph.
|
||||
// in in
|
||||
// Stub Stub
|
||||
// Convert |
|
||||
// Convert |
|
||||
// Convert |
|
||||
// Relu -> Subgraph
|
||||
// Convert |
|
||||
// Convert |
|
||||
// Convert |
|
||||
// Result Result
|
||||
class ConvertManyOnInputOutputFunction : public SnippetsFunctionBase {
|
||||
public:
|
||||
explicit ConvertManyOnInputOutputFunction(const std::vector<Shape>& inputShapes,
|
||||
const std::vector<ov::element::Type>& inTypes,
|
||||
const std::vector<ov::element::Type>& outTypes)
|
||||
: SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) {
|
||||
NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");
|
||||
NGRAPH_CHECK(inTypes.size() > 1, "Got invalid number of input element types");
|
||||
NGRAPH_CHECK(outTypes.size() > 0, "Got invalid number of output element types");
|
||||
}
|
||||
protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
std::shared_ptr<ov::Model> initReference() const override;
|
||||
|
||||
std::vector<ov::element::Type> inTypes;
|
||||
std::vector<ov::element::Type> outTypes;
|
||||
};
|
||||
|
||||
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -7,6 +7,7 @@
|
||||
#include "ngraph/ngraph.hpp"
|
||||
#include "snippets_helpers.hpp"
|
||||
#include "subgraph_simple.hpp"
|
||||
#include "subgraph_converts.hpp"
|
||||
|
||||
/* This file provides lowered representations (after the generate() was calles) for some simple functions.
|
||||
* This is required to test snippets lowering and optimization passes. All the functions are expected to be direct
|
||||
@ -45,7 +46,7 @@ public:
|
||||
protected:
|
||||
std::shared_ptr<ov::Model> initLowered() const override;
|
||||
private:
|
||||
std::vector<Shape> broadcast_shapes;;
|
||||
std::vector<Shape> broadcast_shapes;
|
||||
};
|
||||
|
||||
} // namespace snippets
|
||||
|
@ -29,13 +29,14 @@ protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
std::shared_ptr<ov::Model> initReference() const override;
|
||||
};
|
||||
/// Add separated from inputs by Sin to WA CPU-specific disabling after inputs.
|
||||
/// Add separated from inputs by Sinh to WA CPU-specific disabling after inputs.
|
||||
/// Works because Sinh is not supported by tokenization yet.
|
||||
/// Tokenized simply by starting subgraph.
|
||||
// in1 in2
|
||||
// Sin Sinh
|
||||
// Sinh Sinh
|
||||
// Add
|
||||
// Result
|
||||
// todo: remove Sinh once "no subgraph after input" limitation is relaxed
|
||||
class AddSinhFunction : public SnippetsFunctionBase {
|
||||
public:
|
||||
explicit AddSinhFunction(const std::vector<Shape>& inputShapes) : SnippetsFunctionBase(inputShapes) {
|
||||
@ -45,6 +46,21 @@ protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
std::shared_ptr<ov::Model> initReference() const override;
|
||||
};
|
||||
/// Like AddSinh but with a constant second input (and no sinh on in)
|
||||
// in1 in2
|
||||
// Sin Sinh
|
||||
// Add
|
||||
// Result
|
||||
// todo: remove Sinh once "no subgraph after input" limitation is relaxed
|
||||
class AddSinhConstFunction : public SnippetsFunctionBase {
|
||||
public:
|
||||
explicit AddSinhConstFunction(const std::vector<Shape>& inputShapes) : SnippetsFunctionBase(inputShapes) {
|
||||
NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");
|
||||
}
|
||||
protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
// std::shared_ptr<ov::Model> initReference() const override;
|
||||
};
|
||||
/// Simple Eltwise graph fully convertible to Subgraph.
|
||||
/// Tokenized simply by attaching eltwises.
|
||||
// in1 in2
|
||||
@ -77,6 +93,7 @@ protected:
|
||||
};
|
||||
/// EltwiseFunctionThreeInputs with Sinh after inputs to to WA CPU-specific disabling after inputs
|
||||
/// See AddSinh for details.
|
||||
// todo: remove Sinh once "no subgraph after input" limitation is relaxed
|
||||
class EltwiseThreeInputsSinhFunction : public SnippetsFunctionBase {
|
||||
public:
|
||||
explicit EltwiseThreeInputsSinhFunction(const std::vector<Shape>& inputShapes) :
|
||||
@ -86,6 +103,24 @@ public:
|
||||
protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
};
|
||||
/// Eltwise graph with 10 inputs and 2 outputs.
|
||||
/// Needed to test for a max number of inputs+outputs allowed.
|
||||
// in1 in2 in3 ... in10
|
||||
// Sinh Sinh Sinh ...Sinh
|
||||
// ........................
|
||||
// Subtract Power
|
||||
// \ Sinh
|
||||
// Result
|
||||
// todo: remove Sinh once "no subgraph after input" limitation is relaxed
|
||||
class EltwiseMaxNumParamsSinhFunction : public SnippetsFunctionBase {
|
||||
public:
|
||||
explicit EltwiseMaxNumParamsSinhFunction(const std::vector<Shape>& inputShapes) :
|
||||
SnippetsFunctionBase(inputShapes) {
|
||||
NGRAPH_CHECK(input_shapes.size() == 10, "Got invalid number of input shapes");
|
||||
}
|
||||
protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
};
|
||||
/// MatMul with two eltwise branches joined with Add just before the Result.
|
||||
/// Tokenized by attaching eltwises to separate subgraphs, and then joining them together.
|
||||
// in1 in2
|
||||
@ -125,7 +160,41 @@ protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
std::shared_ptr<ov::Model> initReference() const override;
|
||||
};
|
||||
|
||||
/// 2 results.
|
||||
/// So we have 2 subgraphs - Snippets don't support subgraphs with many results
|
||||
/// Also Output tensors have names to check correct copying output names
|
||||
// in1 in2
|
||||
// Sinh Sinh
|
||||
// Add
|
||||
// HSwish Result
|
||||
// Relu
|
||||
// Result
|
||||
class EltwiseTwoResultsFunction : public SnippetsFunctionBase {
|
||||
public:
|
||||
explicit EltwiseTwoResultsFunction(const std::vector<Shape>& inputShapes) : SnippetsFunctionBase(inputShapes) {
|
||||
NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");
|
||||
}
|
||||
protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
std::shared_ptr<ov::Model> initReference() const override;
|
||||
};
|
||||
/// Two different Input and Outputs.
|
||||
/// This function is to check correct Broadcasting
|
||||
// in1 in2
|
||||
// Sin Sin
|
||||
// HSwish /
|
||||
// Result Add
|
||||
// Relu
|
||||
// Sin
|
||||
// Result
|
||||
class TwoInputsAndOutputsFunction : public SnippetsFunctionBase {
|
||||
public:
|
||||
explicit TwoInputsAndOutputsFunction(const std::vector<Shape>& inputShapes) : SnippetsFunctionBase(inputShapes) {
|
||||
NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");
|
||||
}
|
||||
protected:
|
||||
std::shared_ptr<ov::Model> initOriginal() const override;
|
||||
};
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
||||
|
@ -0,0 +1,241 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "subgraph_converts.hpp"
|
||||
#include "common_test_utils/data_utils.hpp"
|
||||
#include <snippets/op/convert_truncation.hpp>
|
||||
#include <snippets/op/subgraph.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
std::shared_ptr<ov::Node> createRollAsStub(const std::shared_ptr<ov::Node>& parent) {
|
||||
auto shift = std::make_shared<op::v0::Constant>(ov::element::i32, Shape{1}, std::vector<int>{1});
|
||||
auto axes = std::make_shared<op::v0::Constant>(ov::element::i32, Shape{1}, std::vector<int>{0});
|
||||
return std::make_shared<op::v7::Roll>(parent->output(0), shift, axes);
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Model> ConvertFunction::initOriginal() const {
|
||||
auto data0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
|
||||
auto stub = createRollAsStub(data0);
|
||||
auto convert = std::make_shared<op::v0::Convert>(stub, outType);
|
||||
return std::make_shared<ov::Model>(NodeVector{convert}, ParameterVector{data0});
|
||||
}
|
||||
// Expected tokenized graph: the Convert becomes a snippets Subgraph whose body holds
// a ConvertTruncation (the snippets counterpart of op::v0::Convert).
std::shared_ptr<ov::Model> ConvertFunction::initReference() const {
    const auto input = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
    const auto roll = createRollAsStub(input);
    // The subgraph body has its own Parameter mirroring the stub's output shape.
    const auto body_param = std::make_shared<op::v0::Parameter>(inType, roll->get_shape());
    const auto body_cvt = std::make_shared<ngraph::snippets::op::ConvertTruncation>(body_param, outType);
    const auto body = std::make_shared<ov::Model>(NodeVector{body_cvt}, ParameterVector{body_param});
    const auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{roll}, body);
    return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{input});
}
|
||||
|
||||
// Un-tokenized graph with a Convert on one Add input only; the second Parameter is
// already created with outType, so Add sees matching element types.
std::shared_ptr<ov::Model> ConvertInputFunction::initOriginal() const {
    const auto input0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
    const auto input1 = std::make_shared<op::v0::Parameter>(outType, input_shapes[1]);
    const auto roll0 = createRollAsStub(input0);
    const auto roll1 = createRollAsStub(input1);
    const auto cvt = std::make_shared<op::v0::Convert>(roll0, outType);
    const auto sum = std::make_shared<op::v1::Add>(cvt, roll1);
    return std::make_shared<ov::Model>(NodeVector{sum}, ParameterVector{input0, input1});
}
|
||||
// Expected tokenized graph: Convert + Add collapse into one Subgraph; the body's
// Convert is replaced with the snippets ConvertTruncation.
std::shared_ptr<ov::Model> ConvertInputFunction::initReference() const {
    const auto input0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
    const auto input1 = std::make_shared<op::v0::Parameter>(outType, input_shapes[1]);
    const auto roll0 = createRollAsStub(input0);
    const auto roll1 = createRollAsStub(input1);
    // Body parameters mirror the stub shapes/types (note the second one is outType).
    const auto body_in0 = std::make_shared<op::v0::Parameter>(inType, roll0->get_shape());
    const auto body_in1 = std::make_shared<op::v0::Parameter>(outType, roll1->get_shape());
    const auto body_cvt = std::make_shared<ngraph::snippets::op::ConvertTruncation>(body_in0, outType);
    const auto body_add = std::make_shared<op::v1::Add>(body_cvt, body_in1);
    const auto body = std::make_shared<ov::Model>(NodeVector{body_add}, ParameterVector{body_in0, body_in1});
    const auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{roll0, roll1}, body);
    return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{input0, input1});
}
|
||||
|
||||
// Un-tokenized graph with a Convert on the Add output: Add -> Convert -> Result.
std::shared_ptr<ov::Model> ConvertOutputFunction::initOriginal() const {
    const auto input0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
    const auto input1 = std::make_shared<op::v0::Parameter>(inType, input_shapes[1]);
    const auto roll0 = createRollAsStub(input0);
    const auto roll1 = createRollAsStub(input1);
    const auto sum = std::make_shared<op::v1::Add>(roll0, roll1);
    const auto cvt = std::make_shared<op::v0::Convert>(sum, outType);
    return std::make_shared<ov::Model>(NodeVector{cvt}, ParameterVector{input0, input1});
}
|
||||
// Expected tokenized graph: Add + output Convert fuse into one Subgraph whose body
// ends in a ConvertTruncation.
std::shared_ptr<ov::Model> ConvertOutputFunction::initReference() const {
    const auto input0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
    const auto input1 = std::make_shared<op::v0::Parameter>(inType, input_shapes[1]);
    const auto roll0 = createRollAsStub(input0);
    const auto roll1 = createRollAsStub(input1);
    const auto body_in0 = std::make_shared<op::v0::Parameter>(inType, roll0->get_shape());
    const auto body_in1 = std::make_shared<op::v0::Parameter>(inType, roll1->get_shape());
    const auto body_add = std::make_shared<op::v1::Add>(body_in0, body_in1);
    const auto body_cvt = std::make_shared<ngraph::snippets::op::ConvertTruncation>(body_add, outType);
    const auto body = std::make_shared<ov::Model>(NodeVector{body_cvt}, ParameterVector{body_in0, body_in1});
    const auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(NodeVector{roll0, roll1}, body);
    return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{input0, input1});
}
|
||||
|
||||
// Un-tokenized graph with a Convert in the middle of the chain:
// Add -> Convert -> Relu -> Result.
std::shared_ptr<ov::Model> ConvertStubFunction::initOriginal() const {
    const auto input0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
    const auto input1 = std::make_shared<op::v0::Parameter>(inType, input_shapes[1]);
    const auto roll0 = createRollAsStub(input0);
    const auto roll1 = createRollAsStub(input1);
    const auto sum = std::make_shared<op::v1::Add>(roll0, roll1);
    const auto cvt = std::make_shared<op::v0::Convert>(sum, outType);
    const auto act = std::make_shared<op::v0::Relu>(cvt);
    return std::make_shared<ov::Model>(NodeVector{act}, ParameterVector{input0, input1});
}
|
||||
// Expected tokenized graph: the mid-chain Convert splits the chain into TWO
// subgraphs — {Add, ConvertTruncation} and {Relu} — chained back to back.
std::shared_ptr<ov::Model> ConvertStubFunction::initReference() const {
    const auto input0 = std::make_shared<op::v0::Parameter>(inType, input_shapes[0]);
    const auto input1 = std::make_shared<op::v0::Parameter>(inType, input_shapes[1]);
    const auto roll0 = createRollAsStub(input0);
    const auto roll1 = createRollAsStub(input1);
    const auto body0_in0 = std::make_shared<op::v0::Parameter>(inType, roll0->get_shape());
    const auto body0_in1 = std::make_shared<op::v0::Parameter>(inType, roll1->get_shape());
    const auto body0_add = std::make_shared<op::v1::Add>(body0_in0, body0_in1);
    const auto body0_cvt = std::make_shared<ngraph::snippets::op::ConvertTruncation>(body0_add, outType);
    const auto subgraph0 = std::make_shared<ngraph::snippets::op::Subgraph>(
        NodeVector{roll0, roll1}, std::make_shared<ov::Model>(NodeVector{body0_cvt}, ParameterVector{body0_in0, body0_in1}));
    // The second body's Parameter carries the converted precision and shape.
    const auto body1_in = std::make_shared<op::v0::Parameter>(body0_cvt->get_destination_type(), body0_cvt->get_shape());
    const auto body1_relu = std::make_shared<op::v0::Relu>(body1_in);
    const auto subgraph1 = std::make_shared<ngraph::snippets::op::Subgraph>(
        NodeVector{subgraph0}, std::make_shared<ov::Model>(NodeVector{body1_relu}, ParameterVector{body1_in}));
    return std::make_shared<ov::Model>(NodeVector{subgraph1}, ParameterVector{input0, input1});
}
|
||||
|
||||
// Un-tokenized graph: two of three inputs are converted before Add; Relu feeds both
// a converted Result and a Subtract whose output leaves through a Roll stub.
std::shared_ptr<ov::Model> ConvertPartialInputsAndResultsFunction::initOriginal() const {
    const auto input0 = std::make_shared<op::v0::Parameter>(inTypes[0], input_shapes[0]);
    const auto input1 = std::make_shared<op::v0::Parameter>(inTypes[1], input_shapes[1]);
    const auto input2 = std::make_shared<op::v0::Parameter>(inTypes[2], input_shapes[2]);
    const auto roll0 = createRollAsStub(input0);
    const auto roll1 = createRollAsStub(input1);
    const auto roll2 = createRollAsStub(input2);
    const auto cvt0 = std::make_shared<op::v0::Convert>(roll0, outTypes[0]);
    const auto cvt1 = std::make_shared<op::v0::Convert>(roll1, outTypes[0]);
    const auto sum = std::make_shared<op::v1::Add>(cvt0, cvt1);
    const auto act = std::make_shared<op::v0::Relu>(sum);
    const auto diff = std::make_shared<op::v1::Subtract>(act, roll2);
    const auto roll_out = createRollAsStub(diff);
    const auto cvt2 = std::make_shared<op::v0::Convert>(act, outTypes[1]);
    return std::make_shared<ov::Model>(NodeVector{cvt2, roll_out}, ParameterVector{input0, input1, input2});
}
|
||||
// Expected tokenized graph: everything between the Roll stubs becomes one Subgraph
// with two body results — Subtract (output 0) and the converted Relu (output 1).
std::shared_ptr<ov::Model> ConvertPartialInputsAndResultsFunction::initReference() const {
    const auto input0 = std::make_shared<op::v0::Parameter>(inTypes[0], input_shapes[0]);
    const auto input1 = std::make_shared<op::v0::Parameter>(inTypes[1], input_shapes[1]);
    const auto input2 = std::make_shared<op::v0::Parameter>(inTypes[2], input_shapes[2]);
    const auto roll0 = createRollAsStub(input0);
    const auto roll1 = createRollAsStub(input1);
    const auto roll2 = createRollAsStub(input2);
    const auto body_in0 = std::make_shared<op::v0::Parameter>(inTypes[0], roll0->get_shape());
    const auto body_in1 = std::make_shared<op::v0::Parameter>(inTypes[1], roll1->get_shape());
    const auto body_in2 = std::make_shared<op::v0::Parameter>(inTypes[2], roll2->get_shape());
    const auto body_cvt0 = std::make_shared<ngraph::snippets::op::ConvertTruncation>(body_in0, outTypes[0]);
    const auto body_cvt1 = std::make_shared<ngraph::snippets::op::ConvertTruncation>(body_in1, outTypes[0]);
    const auto body_add = std::make_shared<op::v1::Add>(body_cvt0, body_cvt1);
    const auto body_relu = std::make_shared<op::v0::Relu>(body_add);
    const auto body_sub = std::make_shared<op::v1::Subtract>(body_relu, body_in2);
    const auto body_cvt2 = std::make_shared<ngraph::snippets::op::ConvertTruncation>(body_relu, outTypes[1]);
    const auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(
        NodeVector{roll0, roll1, roll2},
        std::make_shared<ov::Model>(NodeVector{body_sub, body_cvt2}, ParameterVector{body_in0, body_in1, body_in2}));
    // createRollAsStub attaches to output(0), i.e. the Subtract branch.
    const auto roll_out = createRollAsStub(subgraph);
    return std::make_shared<ov::Model>(OutputVector{subgraph->output(1), roll_out->output(0)},
                                       ParameterVector{input0, input1, input2});
}
|
||||
|
||||
// Un-tokenized graph with a chain of Converts on the input:
// Parameter -> Roll(stub) -> Convert(types[1]) -> ... -> Convert(types.back()) -> Relu.
// types[0] is the input precision; the chain starts at index 1.
std::shared_ptr<ov::Model> ConvertManyOnInputsFunction::initOriginal() const {
    const auto data0 = std::make_shared<op::v0::Parameter>(types[0], input_shapes[0]);
    const auto stub0 = createRollAsStub(data0);
    std::shared_ptr<ov::Node> out = stub0;
    // size_t index: types.size() is size_t, `auto i = 1` deduced a signed int and
    // triggered a sign-compare warning.
    for (size_t i = 1; i < types.size(); i++) {
        out = std::make_shared<op::v0::Convert>(out, types[i]);
    }
    const auto relu = std::make_shared<op::v0::Relu>(out);
    return std::make_shared<ov::Model>(NodeVector{relu}, ParameterVector{data0});
}
|
||||
// Expected tokenized graph: the whole Convert chain plus the Relu collapse into a
// single Subgraph, each Convert becoming a ConvertTruncation in the body.
std::shared_ptr<ov::Model> ConvertManyOnInputsFunction::initReference() const {
    const auto data0 = std::make_shared<op::v0::Parameter>(types[0], input_shapes[0]);
    const auto stub0 = createRollAsStub(data0);
    const auto indata0 = std::make_shared<op::v0::Parameter>(types[0], stub0->get_shape());
    std::shared_ptr<ov::Node> out = indata0;
    // size_t index fixes the signed/unsigned comparison against types.size().
    for (size_t i = 1; i < types.size(); i++) {
        out = std::make_shared<ngraph::snippets::op::ConvertTruncation>(out, types[i]);
    }
    const auto relu = std::make_shared<op::v0::Relu>(out);
    const auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(
        NodeVector{stub0},
        std::make_shared<ov::Model>(NodeVector{relu}, ParameterVector{indata0}));
    return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{data0});
}
|
||||
|
||||
// Un-tokenized graph with a chain of Converts on the output:
// Parameter -> Sinh(stub) -> Relu -> Convert(types[1]) -> ... -> Result.
std::shared_ptr<ov::Model> ConvertManyOnOutputsFunction::initOriginal() const {
    const auto data0 = std::make_shared<op::v0::Parameter>(types[0], input_shapes[0]);
    const auto stub0 = std::make_shared<ov::op::v0::Sinh>(data0);
    const auto relu = std::make_shared<op::v0::Relu>(stub0);
    std::shared_ptr<ov::Node> out = relu;
    // size_t index: avoids the signed/unsigned comparison with types.size().
    for (size_t i = 1; i < types.size(); i++) {
        out = std::make_shared<op::v0::Convert>(out, types[i]);
    }
    return std::make_shared<ov::Model>(NodeVector{out}, ParameterVector{data0});
}
|
||||
// Expected tokenized graph: Relu plus the full output-Convert chain fuse into one
// Subgraph; each Convert becomes a ConvertTruncation in the body.
std::shared_ptr<ov::Model> ConvertManyOnOutputsFunction::initReference() const {
    const auto data0 = std::make_shared<op::v0::Parameter>(types[0], input_shapes[0]);
    const auto stub0 = std::make_shared<ov::op::v0::Sinh>(data0);
    const auto indata0 = std::make_shared<op::v0::Parameter>(types[0], stub0->get_shape());
    const auto relu = std::make_shared<op::v0::Relu>(indata0);
    std::shared_ptr<ov::Node> out = relu;
    // size_t index fixes the signed/unsigned comparison against types.size().
    for (size_t i = 1; i < types.size(); i++) {
        out = std::make_shared<ngraph::snippets::op::ConvertTruncation>(out, types[i]);
    }
    const auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(
        NodeVector{stub0},
        std::make_shared<ov::Model>(NodeVector{out}, ParameterVector{indata0}));
    return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{data0});
}
|
||||
|
||||
// Un-tokenized graph with Convert chains on both sides of a Relu.
// Input chain: Sinh -> Convert(inTypes[1]) -> ...; output chain: Relu -> Convert(outTypes[0]) -> ...
std::shared_ptr<ov::Model> ConvertManyOnInputOutputFunction::initOriginal() const {
    const auto data0 = std::make_shared<op::v0::Parameter>(inTypes[0], input_shapes[0]);
    const auto stub0 = std::make_shared<ov::op::v0::Sinh>(data0);
    std::shared_ptr<ov::Node> out = stub0;
    // size_t indices: inTypes/outTypes.size() are size_t; `auto i` deduced int.
    for (size_t i = 1; i < inTypes.size(); i++) {
        out = std::make_shared<op::v0::Convert>(out, inTypes[i]);
    }
    // NOTE(review): Relu consumes stub0 rather than `out`, so the input Convert
    // chain built above is unreachable from the Results. initReference builds the
    // same shape, so it may be intentional — confirm with the test author.
    auto relu = std::make_shared<op::v0::Relu>(stub0);
    out = relu;
    for (size_t i = 0; i < outTypes.size(); i++) {
        out = std::make_shared<op::v0::Convert>(out, outTypes[i]);
    }
    return std::make_shared<ov::Model>(NodeVector{out}, ParameterVector{data0});
}
|
||||
// Expected tokenized graph for ConvertManyOnInputOutputFunction.
std::shared_ptr<ov::Model> ConvertManyOnInputOutputFunction::initReference() const {
    const auto data0 = std::make_shared<op::v0::Parameter>(inTypes[0], input_shapes[0]);
    const auto stub0 = std::make_shared<ov::op::v0::Sinh>(data0);
    const auto indata0 = std::make_shared<op::v0::Parameter>(inTypes[0], stub0->get_shape());
    std::shared_ptr<ov::Node> out = indata0;
    // size_t indices fix the signed/unsigned comparisons against *.size().
    // NOTE(review): these body Converts are op::v0::Convert, not ConvertTruncation,
    // unlike the other initReference bodies in this file — verify this is deliberate.
    for (size_t i = 1; i < inTypes.size(); i++) {
        out = std::make_shared<op::v0::Convert>(out, inTypes[i]);
    }
    // NOTE(review): Relu consumes stub0 (an outer-graph node) instead of indata0,
    // mirroring initOriginal's dangling input chain — confirm intended topology.
    auto relu = std::make_shared<op::v0::Relu>(stub0);
    out = relu;
    for (size_t i = 0; i < outTypes.size(); i++) {
        out = std::make_shared<op::v0::Convert>(out, outTypes[i]);
    }
    const auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(
        NodeVector{stub0},
        std::make_shared<ov::Model>(NodeVector{out}, ParameterVector{indata0}));
    return std::make_shared<ov::Model>(NodeVector{subgraph}, ParameterVector{data0});
}
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -46,6 +46,14 @@ std::shared_ptr<ov::Model> AddSinhFunction::initReference() const {
|
||||
ParameterVector{indata0, indata1}));
|
||||
return std::make_shared<ov::Model>(NodeVector{add}, ParameterVector{data0, data1});
|
||||
}
|
||||
// Add with a Constant second operand: Parameter -> Sinh -> Add(random const) -> Result.
std::shared_ptr<ov::Model> AddSinhConstFunction::initOriginal() const {
    const auto input = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
    // Random constant data in [-10, 10], one value per element of the input shape.
    const auto values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0]), -10., 10.);
    const auto constant = std::make_shared<op::v0::Constant>(precision, input_shapes[0], values);
    const auto sinh = std::make_shared<ov::op::v0::Sinh>(input);
    const auto sum = std::make_shared<op::v1::Add>(sinh, constant);
    return std::make_shared<ov::Model>(NodeVector{sum}, ParameterVector{input});
}
|
||||
std::shared_ptr<ov::Model> EltwiseFunction::initOriginal() const {
|
||||
auto data0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
|
||||
auto data1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
|
||||
@ -98,6 +106,28 @@ std::shared_ptr<ov::Model> EltwiseThreeInputsSinhFunction::initOriginal() const
|
||||
auto mul = std::make_shared<op::v1::Multiply>(add, sub);
|
||||
return std::make_shared<ov::Model>(NodeVector{mul}, ParameterVector{data0, data1, data2});
|
||||
}
|
||||
// Reduction-tree model stressing the maximum i/o register count of a subgraph.
// The inline counts (10/5/2) assume ten input shapes; mul[1] below requires at
// least four Adds, i.e. at least eight inputs — presumably guaranteed by the caller.
std::shared_ptr<ov::Model> EltwiseMaxNumParamsSinhFunction::initOriginal() const {
    // One Parameter + Sinh per input shape.
    ParameterVector params;
    std::vector<std::shared_ptr<Node>> sinh_nodes;  // 10
    for (const auto& shape : input_shapes) {
        const auto param = std::make_shared<op::v0::Parameter>(precision, shape);
        params.push_back(param);
        sinh_nodes.push_back(std::make_shared<op::v0::Sinh>(param));
    }
    // Pairwise Adds over consecutive Sinh outputs.
    std::vector<std::shared_ptr<Node>> adds;  // 5
    for (size_t i = 0; i + 1 < sinh_nodes.size(); i += 2) {
        adds.push_back(std::make_shared<op::v1::Add>(sinh_nodes[i], sinh_nodes[i + 1]));
    }
    // Pairwise Multiplies over consecutive Adds.
    std::vector<std::shared_ptr<Node>> muls;  // 2
    for (size_t i = 0; i + 1 < adds.size(); i += 2) {
        muls.push_back(std::make_shared<op::v1::Multiply>(adds[i], adds[i + 1]));
    }
    const auto diff = std::make_shared<op::v1::Subtract>(muls[0], muls[1]);
    const auto pow_node = std::make_shared<op::v1::Power>(adds.back(), diff);
    const auto exit_sinh = std::make_shared<op::v0::Sinh>(pow_node);
    return std::make_shared<ov::Model>(NodeVector{diff, exit_sinh}, params);
}
|
||||
|
||||
std::shared_ptr<ov::Model> MatMulEltwiseBranchesFunction::initOriginal() const {
|
||||
auto data_1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
|
||||
@ -187,6 +217,69 @@ std::shared_ptr<ov::Model> EltwiseLogLoopFunction::initReference() const {
|
||||
return std::make_shared<Model>(NodeVector{mul}, ParameterVector{data0, data1});
|
||||
}
|
||||
|
||||
// Un-tokenized graph with two Results (Add and Relu); both result tensors are
// given explicit names so the tokenized model can be checked against them.
std::shared_ptr<ov::Model> EltwiseTwoResultsFunction::initOriginal() const {
    const auto input0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
    const auto input1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
    const auto sinh0 = std::make_shared<op::v0::Sinh>(input0);
    const auto sinh1 = std::make_shared<op::v0::Sinh>(input1);
    const auto add = std::make_shared<op::v1::Add>(sinh0, sinh1);
    const auto hswish = std::make_shared<op::v4::HSwish>(add);
    const auto relu = std::make_shared<op::v0::Relu>(hswish);

    // set_name() is deprecated; it is exercised here on purpose alongside set_names().
    NGRAPH_SUPPRESS_DEPRECATED_START
    auto& add_tensor = add->get_output_tensor(0);
    add_tensor.set_name("add_out");
    add_tensor.set_names({"add_out", "y0"});

    auto& relu_tensor = relu->get_output_tensor(0);
    relu_tensor.set_name("relu_out");
    relu_tensor.set_names({"relu_out", "y1"});
    NGRAPH_SUPPRESS_DEPRECATED_END

    return std::make_shared<Model>(NodeVector{add, relu}, ParameterVector{input0, input1});
}
|
||||
// Expected tokenized graph: two chained Subgraphs — {Add, HSwish} with two body
// results, then {Relu} fed from the first subgraph's second output. Tensor names
// must match the ones set in initOriginal.
std::shared_ptr<ov::Model> EltwiseTwoResultsFunction::initReference() const {
    const auto input0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
    const auto input1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
    const auto sinh0 = std::make_shared<op::v0::Sinh>(input0);
    const auto sinh1 = std::make_shared<op::v0::Sinh>(input1);
    const auto body0_in0 = std::make_shared<op::v0::Parameter>(precision, sinh0->get_shape());
    const auto body0_in1 = std::make_shared<op::v0::Parameter>(precision, sinh1->get_shape());
    const auto body0_add = std::make_shared<op::v1::Add>(body0_in0, body0_in1);
    const auto body0_hswish = std::make_shared<op::v4::HSwish>(body0_add);
    const auto subgraph0 = std::make_shared<ngraph::snippets::op::Subgraph>(
        NodeVector{sinh0, sinh1},
        std::make_shared<ov::Model>(NodeVector{body0_add, body0_hswish}, ParameterVector{body0_in0, body0_in1}));
    // Second subgraph consumes subgraph0's HSwish output (index 1).
    const auto body1_in = std::make_shared<op::v0::Parameter>(precision, subgraph0->get_output_shape(1));
    const auto body1_relu = std::make_shared<op::v0::Relu>(body1_in);
    const auto subgraph1 = std::make_shared<ngraph::snippets::op::Subgraph>(
        OutputVector{subgraph0->output(1)},
        std::make_shared<ov::Model>(NodeVector{body1_relu}, ParameterVector{body1_in}));
    NGRAPH_SUPPRESS_DEPRECATED_START
    auto& add_tensor = subgraph0->get_output_tensor(0);
    add_tensor.set_name("add_out");
    add_tensor.set_names({"add_out", "y0"});

    auto& relu_tensor = subgraph1->get_output_tensor(0);
    relu_tensor.set_name("relu_out");
    relu_tensor.set_names({"relu_out", "y1"});
    NGRAPH_SUPPRESS_DEPRECATED_END
    return std::make_shared<Model>(OutputVector{subgraph0->output(0), subgraph1->output(0)}, ParameterVector{input0, input1});
}
|
||||
|
||||
// Un-tokenized graph matching the class's topology sketch: two Sin branches, with
// HSwish feeding both a Result and (via Add -> Relu -> Sin) the second Result.
std::shared_ptr<ov::Model> TwoInputsAndOutputsFunction::initOriginal() const {
    const auto input0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
    const auto input1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
    const auto sin0 = std::make_shared<op::v0::Sin>(input0);
    const auto sin1 = std::make_shared<op::v0::Sin>(input1);
    const auto hswish = std::make_shared<op::v4::HSwish>(sin0);
    const auto sum = std::make_shared<op::v1::Add>(hswish, sin1);
    const auto relu = std::make_shared<op::v0::Relu>(sum);
    const auto sin_out = std::make_shared<op::v0::Sin>(relu);

    return std::make_shared<Model>(NodeVector{hswish, sin_out}, ParameterVector{input0, input1});
}
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
Loading…
Reference in New Issue
Block a user