[CPU] Snippets throughput mode fixes (#9488)
This commit is contained in:
@@ -16,29 +16,24 @@ namespace op {
|
||||
* @brief Generated by Canonicalization for a scalar constant Shape() == {1}
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class Scalar : public ngraph::op::Constant {
|
||||
class Scalar : public ov::op::v0::Constant {
|
||||
public:
|
||||
OPENVINO_OP("Scalar", "SnippetsOpset", ngraph::op::Constant);
|
||||
OPENVINO_OP("Scalar", "SnippetsOpset", ov::op::v0::Constant);
|
||||
|
||||
Scalar() = default;
|
||||
Scalar(const std::shared_ptr<runtime::Tensor>& tensor) : Constant(tensor) {}
|
||||
template <typename T>
|
||||
Scalar(const element::Type& type, Shape shape, const std::vector<T>& values) : Constant(type, shape, values) {}
|
||||
Scalar(const element::Type& type, const Shape& shape) : Constant(type, shape) {}
|
||||
template <class T, class = typename std::enable_if<std::is_fundamental<T>::value>::type>
|
||||
Scalar(const element::Type& type, Shape shape, T value) : Constant(type, shape, value) {}
|
||||
Scalar(const element::Type& type, Shape shape, const std::vector<std::string>& values) : Constant(type, shape, values) {}
|
||||
Scalar(const element::Type& type, const Shape& shape, const void* data) : Constant(type, shape, data) {}
|
||||
|
||||
Scalar(const Constant& other) : Constant(other) {}
|
||||
Scalar(const Scalar& other) : Constant(other) {}
|
||||
Scalar& operator=(const Scalar&) = delete;
|
||||
~Scalar() override {}
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<Scalar>(*this);
|
||||
Scalar(const element::Type& type, Shape shape, T value) : Constant(type, shape, value) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
explicit Scalar(const Constant& other) : Constant(other) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
Scalar(const Scalar& other) : Constant(other) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
Scalar& operator=(const Scalar&) = delete;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
void validate_and_infer_types() override;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
|
||||
@@ -88,14 +88,14 @@ public:
|
||||
return m_generator;
|
||||
}
|
||||
|
||||
std::shared_ptr<Subgraph> make_canonical_from_this();
|
||||
|
||||
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes,
|
||||
ngraph::pass::Manager opt = ngraph::pass::Manager(), const void* compile_params = nullptr);
|
||||
ngraph::pass::Manager& opt, const void* compile_params = nullptr);
|
||||
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes,
|
||||
const void* compile_params = nullptr);
|
||||
/// Set a new body for the op; body needs to satisfy requirements on inputs/outputs
|
||||
void set_body(std::shared_ptr<ov::Model> body);
|
||||
snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr);
|
||||
snippets::Schedule generate(const void* compile_params = nullptr);
|
||||
Shape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes);
|
||||
|
||||
// plugin sets generator for a snippet to some specific generator.
|
||||
// it's going to be replaced with Jitters table later
|
||||
@@ -109,9 +109,8 @@ public:
|
||||
static auto wrap_node_as_subgraph(const std::shared_ptr<ngraph::Node>& node) -> std::shared_ptr<Subgraph>;
|
||||
|
||||
private:
|
||||
void canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes);
|
||||
void convert_to_snippet_dialect();
|
||||
|
||||
Shape exec_domain;
|
||||
std::shared_ptr<ov::Model> m_body;
|
||||
std::shared_ptr<ngraph::snippets::Generator> m_generator;
|
||||
};
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface ConvertConstantsToScalars
|
||||
* @brief Replace only those constants that should be represented as scalars during code generation.
|
||||
* Only single-value (0D) constants are currently supported.
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class ConvertConstantsToScalars: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
// Sets up the Constant pattern and registers the callback that replaces a matched Constant with a Scalar.
ConvertConstantsToScalars();
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
||||
@@ -0,0 +1,26 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface ConvertPowerToPowerStatic
|
||||
* @brief Replace Power with a scalar input with snippets::op::PowerStatic for generation of a more optimal code.
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class ConvertPowerToPowerStatic: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
// Sets up the pattern for Power with a Scalar at input 1 and registers the callback that replaces it with PowerStatic.
ConvertPowerToPowerStatic();
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
||||
22
src/common/snippets/src/op/scalar.cpp
Normal file
22
src/common/snippets/src/op/scalar.cpp
Normal file
@@ -0,0 +1,22 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/op/scalar.hpp"
|
||||
|
||||
using namespace ngraph;
|
||||
|
||||
std::shared_ptr<Node> snippets::op::Scalar::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<Scalar>(*this);
|
||||
}
|
||||
|
||||
// Scalar currently supports only one-element constants, this could be changed in the future
|
||||
void snippets::op::Scalar::validate_and_infer_types() {
|
||||
Constant::validate_and_infer_types();
|
||||
auto out_pshape = get_output_partial_shape(0);
|
||||
NODE_VALIDATION_CHECK(this, out_pshape.is_static(), "Scalar supports only static input shapes");
|
||||
NODE_VALIDATION_CHECK(this, out_pshape.get_shape().empty() || ov::shape_size(out_pshape.get_shape()) == 1,
|
||||
"Scalar supports only one-element constants, got ", out_pshape.get_shape(),
|
||||
" shape");
|
||||
}
|
||||
@@ -10,6 +10,8 @@
|
||||
#include "snippets/pass/insert_movebroadcast.hpp"
|
||||
#include "snippets/pass/load_movebroadcast_to_broadcastload.hpp"
|
||||
#include "snippets/pass/assign_registers.hpp"
|
||||
#include "snippets/pass/convert_constants_to_scalars.hpp"
|
||||
#include "snippets/pass/convert_power_to_powerstatic.hpp"
|
||||
|
||||
#include <ngraph/pass/manager.hpp>
|
||||
#include <openvino/pass/serialize.hpp>
|
||||
@@ -115,79 +117,102 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
|
||||
|
||||
return subgraph;
|
||||
}
|
||||
|
||||
std::shared_ptr<snippets::op::Subgraph> snippets::op::Subgraph::make_canonical_from_this() {
|
||||
INTERNAL_OP_SCOPE(Subgraph);
|
||||
ngraph::OutputVector subgraph_node_inputs;
|
||||
for (auto input : this->input_values()) {
|
||||
subgraph_node_inputs.push_back(input);
|
||||
}
|
||||
auto new_body = ov::clone_model(*this->get_body().get());
|
||||
auto snippet = std::make_shared<op::Subgraph>(subgraph_node_inputs, new_body);
|
||||
ngraph::copy_runtime_info(this->shared_from_this(), snippet);
|
||||
snippet->set_friendly_name(this->get_friendly_name());
|
||||
snippet->set_generator(this->m_generator);
|
||||
|
||||
return snippet;
|
||||
}
|
||||
|
||||
// We also can think of canonization as of pass to copy original subgraph and transforming it to canonical form suitable for code generation
|
||||
// pass actual parameters and results shapes to generate for as well as channel mapping,
|
||||
// Todo: we need to distinguish between 5d tensors that represents <N, C, H, W, c> and <N, C, D, H, W> somehow like locked dimensions
|
||||
// ngraph::AxisVector to code
|
||||
void snippets::op::Subgraph::canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes) {
|
||||
///
|
||||
/// \brief Canonicalization transforms the original subgraph into a canonical form suitable for code generation. In particular,
|
||||
/// it handles supported layout conversions, broadcasts inputs and outputs to a single rank and layout. Canonicalization
|
||||
/// returns master-shape (max rank + max dimensions over all outputs) that can be used for scheduling.
|
||||
/// Canonicalization currently supports only the following layout conversions:
|
||||
/// * None: all inputs have the same layout
|
||||
/// * Planar + blocked: some inputs have blocked, and some have planar layouts, e.g. <N, C, H, W, c> + <N, C, H, W>
|
||||
Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) {
|
||||
INTERNAL_OP_SCOPE(Subgraph);
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::canonicalize")
|
||||
NODE_VALIDATION_CHECK(this, input_shapes.size() == m_body->get_parameters().size(),
|
||||
"Number of parameters for snippet doesn't match passed to generate method: ", input_shapes.size(), " vs ", m_body->get_parameters().size(), ".");
|
||||
NODE_VALIDATION_CHECK(this, inputShapes.size() == m_body->get_parameters().size(),
|
||||
"Number of parameters for snippet doesn't match passed to generate method: ", inputShapes.size(), " vs ", m_body->get_parameters().size(), ".");
|
||||
|
||||
NODE_VALIDATION_CHECK(this, output_shapes.size() == m_body->get_results().size(),
|
||||
"number of results for snippet doesn't match passed to generate method: ", output_shapes.size(), " vs ", m_body->get_results().size(), ".");
|
||||
|
||||
// replace only constants which are actually should be represented as scalars during code generation and probably move this step a bit later
|
||||
for (auto op : m_body->get_ordered_ops()) {
|
||||
if (auto constant = ngraph::as_type_ptr<opset1::Constant>(op)) {
|
||||
auto scalar = std::make_shared<snippets::op::Scalar>(*constant);
|
||||
scalar->set_friendly_name(constant->get_friendly_name());
|
||||
ngraph::copy_runtime_info(constant, scalar);
|
||||
ngraph::replace_node(constant, scalar);
|
||||
}
|
||||
NODE_VALIDATION_CHECK(this, outputShapes.size() == m_body->get_results().size(),
|
||||
"number of results for snippet doesn't match passed to generate method: ", outputShapes.size(), " vs ", m_body->get_results().size(), ".");
|
||||
// todo: does it allowed to have outputs with different layouts? I assume no, remove if invalid
|
||||
const AxisVector outOrder = get<1>(outputShapes[0]);
|
||||
for (size_t i = 1; i < outputShapes.size(); i++) {
|
||||
const AxisVector order_i = get<1>(outputShapes[i]);
|
||||
NODE_VALIDATION_CHECK(this, outOrder.size() == order_i.size() && equal(outOrder.begin(), outOrder.end(), order_i.begin()),
|
||||
"Snippets output shapes must have the same layout");
|
||||
}
|
||||
|
||||
|
||||
|
||||
// it should be in subgraph node to be aligned with internal and external parameter list, but adding this for testing
|
||||
// TODO: store blocking into to Parameter's rt_info for future propagation
|
||||
for (size_t i = 0; i < m_body->get_parameters().size(); i++) {
|
||||
auto param = m_body->get_parameters()[i];
|
||||
if (param->get_shape().size() < 4) {
|
||||
std::vector<size_t> shape(4, 1);
|
||||
std::copy(param->get_shape().begin(), param->get_shape().end(), &shape.at(4 - (param->get_shape().size() == 0 ? 1 : param->get_shape().size())) );
|
||||
m_body->replace_parameter(i, std::make_shared<opset1::Parameter>(param->get_element_type(), ngraph::Shape(shape)));
|
||||
} else if (param->get_shape().size() >= 4) {
|
||||
if (param->get_element_type() != std::get<2>(input_shapes[i])) {
|
||||
throw ngraph::ngraph_error("changes in presision. Is it legal??");
|
||||
auto getMaxRankBlockedShape = [](const BlockedShapeVector& blockedShapes) -> const BlockedShape& {
|
||||
return *std::max_element(blockedShapes.begin(), blockedShapes.end(),
|
||||
[&](const BlockedShape& lhs, const BlockedShape& rhs) {
|
||||
return std::get<0>(lhs).size() < std::get<0>(rhs).size();
|
||||
});
|
||||
};
|
||||
Shape baseShape;
|
||||
AxisVector baseOrder;
|
||||
std::tie(baseShape, baseOrder, std::ignore) = getMaxRankBlockedShape(inputShapes);
|
||||
const auto baseRank = baseShape.size();
|
||||
const bool baseIsBlocked = baseOrder.size() != std::set<size_t>(baseOrder.begin(), baseOrder.end()).size();
|
||||
for (size_t i = 0; i < inputShapes.size(); i++) {
|
||||
const auto &blockedShape = inputShapes[i];
|
||||
Shape inShape;
|
||||
AxisVector inOrder;
|
||||
element::Type inType;
|
||||
std::tie(inShape, inOrder, inType) = blockedShape;
|
||||
const auto inRank = inShape.size();
|
||||
NODE_VALIDATION_CHECK(this, inRank <= baseRank, "Input rank can't be larger than output rank in snippets.");
|
||||
if (inRank < baseRank) {
|
||||
Shape newShape(baseRank, 1);
|
||||
// todo: more complicated logics is needed if we want to merge smth else than blocked and planar
|
||||
// could be done by PartialShape::broadcast_merge_into, but this way is faster
|
||||
size_t startOffset = baseRank - inRank;
|
||||
if (baseIsBlocked) {
|
||||
const bool inIsNotBlocked = inOrder.size() == std::set<size_t>(inOrder.begin(), inOrder.end()).size();
|
||||
NODE_VALIDATION_CHECK(this, inIsNotBlocked, "Snippets don't support conversion between blocked layouts of different ranks");
|
||||
startOffset--;
|
||||
}
|
||||
m_body->replace_parameter(i, std::make_shared<opset1::Parameter>(std::get<2>(input_shapes[i]), std::get<0>(input_shapes[i])));
|
||||
std::copy(inShape.begin(), inShape.end(), &newShape[startOffset]);
|
||||
inShape = move(newShape);
|
||||
} else {
|
||||
// todo: 4d blocked + 5d planar layouts are not supported: <N, C, H, W, c> + <N, C, D, H, W>
|
||||
NODE_VALIDATION_CHECK(this,
|
||||
equal(baseOrder.begin(), baseOrder.end(), inOrder.begin()),
|
||||
"Snippets canonicalization got input shapes of equal ranks but different layouts, which is not supported");
|
||||
}
|
||||
ov::PartialShape tmpPShape(baseShape);
|
||||
NODE_VALIDATION_CHECK(this,
|
||||
PartialShape::broadcast_merge_into(tmpPShape, inShape, ::ngraph::op::AutoBroadcastType::NUMPY),
|
||||
"Failed to create broadcastable shapes in snippets canonicalization");
|
||||
const auto paramShape = m_body->get_parameters()[i]->get_shape();
|
||||
if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin()))
|
||||
m_body->replace_parameter(i, std::make_shared<opset1::Parameter>(inType, inShape));
|
||||
}
|
||||
|
||||
m_body->validate_nodes_and_infer_types();
|
||||
|
||||
for (size_t i = 0; i < m_body->get_results().size(); i++) {
|
||||
auto result = m_body->get_results()[i];
|
||||
PartialShape partial(result->get_shape());
|
||||
bool isCompatible = ngraph::PartialShape::broadcast_merge_into(partial, std::get<0>(output_shapes[i]), ::ngraph::op::AutoBroadcastType::NUMPY);
|
||||
// equality check won't pass since we reshape without changes on external snippet edges
|
||||
NODE_VALIDATION_CHECK(this, isCompatible, "Inferend and passed results shapes are difference for snippet : ",
|
||||
result->get_shape(), " vs ", std::get<0>(output_shapes[i]), ".");
|
||||
// Check that output shapes are broadcastable => can be scheduled
|
||||
const auto& body_results = m_body->get_results();
|
||||
PartialShape outPShape = body_results[0]->get_shape();
|
||||
for (size_t i = 0; i < body_results.size(); i++) {
|
||||
auto shape_i = body_results[i]->get_shape();
|
||||
PartialShape pShape_i(shape_i);
|
||||
// Check that the produced output shape corresponds to the passed shape
|
||||
bool compatibleWithPassedShape = PartialShape::broadcast_merge_into(pShape_i, std::get<0>(outputShapes[i]),
|
||||
::ngraph::op::AutoBroadcastType::NUMPY);
|
||||
NODE_VALIDATION_CHECK(this, compatibleWithPassedShape, "Inferred and passed results shapes are difference for snippet : ",
|
||||
shape_i, " vs ", std::get<0>(outputShapes[i]), ".");
|
||||
// Check that output shapes are broadcastable to each other => can be scheduled
|
||||
bool compatibleWithOtherOutputs = PartialShape::broadcast_merge_into(outPShape, shape_i,
|
||||
::ngraph::op::AutoBroadcastType::NUMPY);
|
||||
NODE_VALIDATION_CHECK(this, compatibleWithOtherOutputs, "Snippets output shapes must be numpy broadcastable");
|
||||
}
|
||||
exec_domain = outPShape.get_shape();
|
||||
return exec_domain;
|
||||
}
|
||||
|
||||
void snippets::op::Subgraph::convert_to_snippet_dialect() {
|
||||
INTERNAL_OP_SCOPE(Subgraph);
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::convert_to_snippet_dialect")
|
||||
ngraph::pass::Manager manager;
|
||||
manager.register_pass<snippets::pass::ConvertConstantsToScalars>();
|
||||
manager.register_pass<snippets::pass::ConvertPowerToPowerStatic>();
|
||||
manager.register_pass<snippets::pass::InsertLoad>();
|
||||
manager.register_pass<snippets::pass::InsertStore>();
|
||||
manager.register_pass<snippets::pass::InsertMoveBroadcast>();
|
||||
@@ -198,39 +223,27 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() {
|
||||
snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& output_shapes,
|
||||
const BlockedShapeVector& input_shapes,
|
||||
const void* compile_params) {
|
||||
return generate(output_shapes, input_shapes, ngraph::pass::Manager(), compile_params);
|
||||
canonicalize(output_shapes, input_shapes);
|
||||
return generate(compile_params);
|
||||
}
|
||||
|
||||
snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& output_shapes,
|
||||
const BlockedShapeVector& input_shapes,
|
||||
ngraph::pass::Manager opt,
|
||||
ngraph::pass::Manager& opt,
|
||||
const void* compile_params) {
|
||||
canonicalize(output_shapes, input_shapes);
|
||||
return generate(opt, compile_params);
|
||||
}
|
||||
|
||||
snippets::Schedule snippets::op::Subgraph::generate(const void* compile_params) {
|
||||
auto mngr = ngraph::pass::Manager();
|
||||
return generate(mngr, compile_params);
|
||||
}
|
||||
|
||||
snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, const void* compile_params) {
|
||||
INTERNAL_OP_SCOPE(Subgraph);
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::generate")
|
||||
NGRAPH_CHECK(m_generator != nullptr, "generate is called while generator is not set");
|
||||
|
||||
canonicalize(output_shapes, input_shapes);
|
||||
|
||||
// Todo: ngraph::pass::Manager introduces appreciable overheads, especially while used on small graphs.
|
||||
// So don't wrap this transformation as a MatcherPass, but rewrite convert_to_snippet_dialect() as a
|
||||
// for loop to improve first-inference time.
|
||||
// replace power with power static
|
||||
|
||||
for (auto op : m_body->get_ordered_ops()) {
|
||||
if (ov::is_type<opset1::Power>(op) &&
|
||||
ov::is_type<snippets::op::Scalar>(op->get_input_node_shared_ptr(1)) &&
|
||||
ov::shape_size(op->get_input_shape(1)) == 1) {
|
||||
auto power = ov::as_type_ptr<opset1::Power>(op);
|
||||
auto scalar = ov::as_type_ptr<snippets::op::Scalar>(op->get_input_node_shared_ptr(1));
|
||||
auto value = scalar->cast_vector<float>()[0];;
|
||||
auto power_static = std::make_shared<snippets::op::PowerStatic>(power->input(0).get_source_output(), value);
|
||||
power_static->set_friendly_name(power->get_friendly_name());
|
||||
ngraph::copy_runtime_info(power, power_static);
|
||||
ngraph::replace_node(power, power_static);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
convert_to_snippet_dialect();
|
||||
opt.run_passes(m_body);
|
||||
|
||||
@@ -253,27 +266,7 @@ snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& ou
|
||||
}
|
||||
NGRAPH_CHECK(!constants.size(), "External constants detected. Snippet is illigal for scheduling");
|
||||
|
||||
// check resulting shapes are broadcastable to each other so can be scheduled
|
||||
Shape work_size = m_body->output(0).get_shape();
|
||||
for (size_t k = 0; k < m_body->get_output_size(); k++) {
|
||||
auto shape = m_body->output(k).get_shape();
|
||||
|
||||
if (work_size.size() != shape.size()) {
|
||||
throw ngraph_error("rank for all outputs of a snippet should match");
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < work_size.size(); i++) {
|
||||
if (work_size[i] != shape[i]) {
|
||||
if (work_size[i] == 1 || shape[i] == 1) {
|
||||
work_size[i] = max(work_size[i], shape[i]);
|
||||
} else {
|
||||
throw ngraph_error("incompatible shapes for output graphs");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return {work_size, false /*canBeLinearized*/, ptr};
|
||||
return {exec_domain, false /*canBeLinearized*/, ptr};
|
||||
}
|
||||
|
||||
void snippets::op::Subgraph::print() const {
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
#include "snippets/pass/convert_constants_to_scalars.hpp"
|
||||
#include <ngraph/rt_info.hpp>
|
||||
|
||||
|
||||
/// Registers a matcher that replaces every matched ov::op::v0::Constant with a
/// snippets::op::Scalar constructed from that constant. The friendly name and
/// runtime info of the constant are carried over to the new node.
ngraph::snippets::pass::ConvertConstantsToScalars::ConvertConstantsToScalars() {
    MATCHER_SCOPE(ConvertConstantsToScalars);

    // Pattern: any node that is (castable to) a v0::Constant.
    const auto is_constant = [](std::shared_ptr<Node> node) {
        return ngraph::is_type<ov::op::v0::Constant>(node);
    };
    auto constant_pattern = std::make_shared<pattern::op::Label>(pattern::any_input(), is_constant);

    ngraph::graph_rewrite_callback replace_with_scalar = [this](ngraph::pattern::Matcher& matched) {
        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ConvertConstantsToScalars")
        const auto source_constant = as_type_ptr<ov::op::v0::Constant>(matched.get_match_root());

        // Constructing the Scalar runs its own validation of the constant's shape.
        const auto replacement = std::make_shared<snippets::op::Scalar>(*source_constant);
        replacement->set_friendly_name(source_constant->get_friendly_name());
        ngraph::copy_runtime_info(source_constant, replacement);
        ngraph::replace_node(source_constant, replacement);

        return true;
    };

    auto constant_matcher = std::make_shared<ov::pass::pattern::Matcher>(constant_pattern);
    register_matcher(constant_matcher, replace_with_scalar);
}
|
||||
@@ -0,0 +1,31 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
#include "snippets/pass/convert_power_to_powerstatic.hpp"
|
||||
#include <ngraph/rt_info.hpp>
|
||||
|
||||
|
||||
/// Registers a matcher that replaces an ov::op::v1::Power whose input at index 1 is a
/// snippets::op::Scalar with a snippets::op::PowerStatic. The PowerStatic is built from the
/// Power's first input and the first element of the Scalar read as float. The friendly name
/// and runtime info of the Power are carried over to the new node.
ngraph::snippets::pass::ConvertPowerToPowerStatic::ConvertPowerToPowerStatic() {
    MATCHER_SCOPE(ConvertPowerToPowerStatic);
    // Pattern: a Power node whose second input is produced by a snippets Scalar.
    auto scalarPower = std::make_shared<pattern::op::Label>(pattern::any_input(),
                                                            [](std::shared_ptr<Node> n) {
                                                                return is_type<ov::op::v1::Power>(n) &&
                                                                       is_type<snippets::op::Scalar>(n->get_input_node_shared_ptr(1));
                                                            });
    ngraph::graph_rewrite_callback callback = [this](ngraph::pattern::Matcher &m) {
        // The profiling label must name this pass; it used to carry the ConvertConstantsToScalars label.
        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ConvertPowerToPowerStatic")
        // Both casts are expected to succeed because the pattern predicate already checked the types.
        auto power = ov::as_type_ptr<ov::op::v1::Power>(m.get_match_root());
        auto scalar = ov::as_type_ptr<snippets::op::Scalar>(power->get_input_node_shared_ptr(1));
        auto value = scalar->cast_vector<float>()[0];
        auto power_static = std::make_shared<snippets::op::PowerStatic>(power->input(0).get_source_output(), value);
        power_static->set_friendly_name(power->get_friendly_name());
        ngraph::copy_runtime_info(power, power_static);
        ngraph::replace_node(power, power_static);

        return true;
    };
    register_matcher(std::make_shared<ov::pass::pattern::Matcher>(scalarPower), callback);
}
|
||||
@@ -24,7 +24,7 @@ struct jit_snippets_compile_args {
|
||||
int64_t scheduler_dims[SNIPPETS_MAX_TILE_RANK] = {};
|
||||
int64_t scheduler_offsets[SNIPPETS_MAX_SNIPPETS_DIMS] = {};
|
||||
int64_t data_offsets[SNIPPETS_MAX_SNIPPETS_DIMS * SNIPPETS_MAX_HARNESS_DIMS] = {};
|
||||
std::vector<int64_t> output_dims = {};
|
||||
std::vector<size_t> output_dims = {};
|
||||
};
|
||||
///
|
||||
/// \brief Kernel is the only entry point to Codegen Jit compilation. Kernel calculates appropriate data offsets,
|
||||
@@ -361,13 +361,7 @@ class ScalarEmitter : public jit_emitter {
|
||||
public:
|
||||
ScalarEmitter(mkldnn::impl::cpu::x64::jit_generator* h, mkldnn::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
|
||||
: jit_emitter(h, isa, n) {
|
||||
auto out_pshape = n->output(0).get_tensor().get_partial_shape();
|
||||
if (out_pshape.is_dynamic())
|
||||
IE_THROW() << "ScalarEmitter supports only static input shapes";
|
||||
if ( out_pshape.get_shape() != ov::Shape() && ov::shape_size(out_pshape.get_shape()) != 1)
|
||||
IE_THROW() << "ScalarEmitter got invalid shape";
|
||||
value = mkldnn::impl::cpu::x64::float2int(ov::as_type_ptr<ngraph::snippets::op::Scalar>(n)->cast_vector<float>()[0]);
|
||||
|
||||
push_arg_entry_of("scalar", value, true);
|
||||
prepare_table();
|
||||
}
|
||||
|
||||
@@ -58,25 +58,6 @@ void MKLDNNSnippetNode::initSupportedPrimitiveDescriptors() {
|
||||
if (!supportedPrimitiveDescriptors.empty())
|
||||
return;
|
||||
|
||||
auto hasBroadcastByC = [this]() -> bool {
|
||||
for (auto op : ngraph::as_type_ptr<ngraph::snippets::op::Subgraph>(snippet)->get_body()->get_ops()) {
|
||||
if (ngraph::op::supports_auto_broadcast(op)) {
|
||||
auto shape = op->get_input_shape(0);
|
||||
// Filter out scalar empty shape Shape{}
|
||||
if (ngraph::shape_size(shape) != 1) {
|
||||
for (const auto& input : op->inputs()) {
|
||||
if (input.get_shape().size() > 1 && shape[1] != input.get_shape()[1] && ngraph::shape_size(input.get_shape()) != 1) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
const Precision supportedPrecision = Precision::FP32;
|
||||
|
||||
bool dimRanksAreEqual = true;
|
||||
@@ -90,9 +71,9 @@ void MKLDNNSnippetNode::initSupportedPrimitiveDescriptors() {
|
||||
const size_t ndims = outputShapes[0].getRank();
|
||||
const bool isChannelsFirstApplicable = dnnl::impl::utils::one_of(ndims, 1, 2, 4, 5) && dimRanksAreEqual;
|
||||
// Todo: Snippets currently don't support per-channel broadcasting of Blocked descriptors because
|
||||
// canonicalization can't distinguish between <N, C, H, W, c> and <N, C, D, H, W> cases. So we need to pass an
|
||||
// additional parameter to canonicalization, see snippets::op::Subgraph::canonicalize for details.
|
||||
const bool isBlockedApplicable = dnnl::impl::utils::one_of(ndims, 4, 5) && dimRanksAreEqual && !hasBroadcastByC();
|
||||
// canonicalization can't distinguish between <N, C, H, W, c> and <N, C, D, H, W> cases.
|
||||
// See snippets::op::Subgraph::canonicalize for details.
|
||||
const bool isBlockedApplicable = dnnl::impl::utils::one_of(ndims, 4, 5) && dimRanksAreEqual;
|
||||
enum LayoutType {
|
||||
Planar,
|
||||
ChannelsFirst,
|
||||
@@ -255,15 +236,17 @@ static size_t argmax_rank(const std::vector<MKLDNNEdgeWeakPtr> &childEdges) {
|
||||
return max_rank_idx;
|
||||
}
|
||||
|
||||
static auto offset_calculation(std::vector<int64_t>& offset, const std::vector<int64_t>& dims_in, const std::vector<int64_t>& dims_out) -> void {
|
||||
int k = 1;
|
||||
static void offset_calculation(std::vector<size_t>& offset, const std::vector<size_t>& dims_in, const std::vector<size_t>& dims_out) {
|
||||
size_t k = 1;
|
||||
for (int i = offset.size() - 1; i >= 0; i--) {
|
||||
offset[i] = (dims_in[i] == dims_out[i]) ? k : 0;
|
||||
k *= dims_in[i];
|
||||
}
|
||||
}
|
||||
|
||||
static auto collapseLastDims(std::vector<int64_t>& dims, int dimsToCollapse) -> void {
|
||||
static auto collapseLastDims(std::vector<size_t>& dims, size_t dimsToCollapse) -> void {
|
||||
if (dimsToCollapse >= dims.size() - 1)
|
||||
IE_THROW() << "Got invalid number of dims to collapse. Expected < " << dims.size() - 1 << " got " << dimsToCollapse;
|
||||
for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) {
|
||||
dims[dims.size() - 1] *= dims[i];
|
||||
}
|
||||
@@ -278,63 +261,51 @@ static auto collapseLastDims(std::vector<int64_t>& dims, int dimsToCollapse) ->
|
||||
}
|
||||
|
||||
void MKLDNNSnippetNode::define_schedule() {
|
||||
auto edgeToBlockedShape = [](const MKLDNNEdgePtr& edge) {
|
||||
const auto blockedDesc = edge->getMemory().GetDescWithType<BlockedMemoryDesc>();
|
||||
ngraph::Shape shape(blockedDesc->getBlockDims());
|
||||
ngraph::AxisVector blocking(blockedDesc->getOrder());
|
||||
ngraph::element::Type precision = InferenceEngine::details::convertPrecision(blockedDesc->getPrecision());
|
||||
return ngraph::snippets::op::Subgraph::BlockedShape{shape, blocking, precision};
|
||||
};
|
||||
auto prependWithOnes = [this](const std::vector<size_t>& dims) {
|
||||
if (tensorRank <= dims.size())
|
||||
return dims;
|
||||
VectorDims result(tensorRank, 1);
|
||||
std::copy(dims.begin(), dims.end(), &result[tensorRank - dims.size()]);
|
||||
return result;
|
||||
};
|
||||
ngraph::snippets::op::Subgraph::BlockedShapeVector input_blocked_shapes;
|
||||
for (size_t i = 0; i < inputShapes.size(); i++)
|
||||
input_blocked_shapes.push_back(edgeToBlockedShape(getParentEdgesAtPort(i)[0]));
|
||||
|
||||
ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes;
|
||||
for (size_t i = 0; i < outputShapes.size(); i++)
|
||||
output_blocked_shapes.push_back(edgeToBlockedShape(getChildEdgesAtPort(i)[0]));
|
||||
exec_domain = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes);
|
||||
// initialize by maximum output dimension. Dimensions of outputs should be broadcastable
|
||||
tensorRank = std::max(static_cast<size_t>(rank6D), exec_domain.size());
|
||||
// Canonicalization broadcasts inputs and outputs to max input rank, which can be smaller than tensorRank
|
||||
// prepend to enable 6D scheduler
|
||||
exec_domain = prependWithOnes(exec_domain);
|
||||
const auto &body = snippet->get_body();
|
||||
for (const auto& p : body->get_parameters()) {
|
||||
dims_in.emplace_back(prependWithOnes(p->get_shape()));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < body->get_output_size(); i++) {
|
||||
dims_out.push_back(prependWithOnes(body->get_output_shape(i)));
|
||||
}
|
||||
|
||||
const auto config = getSelectedPrimitiveDescriptor()->getConfig();
|
||||
const auto dataSize = config.inConfs[0].getMemDesc()->getPrecision().size();
|
||||
// store to use as an execution domain
|
||||
max_rank_out_desc_idx = argmax_rank(getChildEdges());
|
||||
const auto outBlockingDesc_maxRank = getChildEdgeAt(max_rank_out_desc_idx)->getMemory().GetDescWithType<BlockedMemoryDesc>();
|
||||
// initialize by maximum output dimension. Dimensions of outputs should be broadcastable
|
||||
tensorRank = std::max(static_cast<size_t>(rank6D), outBlockingDesc_maxRank->getBlockDims().size());
|
||||
|
||||
auto initDims = [this, config, &outBlockingDesc_maxRank](size_t tensorRank) {
|
||||
// assume all input sizes are even
|
||||
const size_t inputNum = getParentEdges().size();
|
||||
|
||||
dims_in.resize(inputNum);
|
||||
for (size_t i = 0; i < inputNum; i++) {
|
||||
dims_in[i].resize(tensorRank, 1);
|
||||
}
|
||||
|
||||
const auto outOrder = outBlockingDesc_maxRank->getOrder();
|
||||
for (size_t i = 0; i < inputNum; i++) {
|
||||
auto inBlockingDesc = getParentEdgeAt(i)->getMemory().GetDescWithType<BlockedMemoryDesc>();
|
||||
size_t rank = inBlockingDesc->getBlockDims().size();
|
||||
|
||||
// WA to normalize blocked and planar layouts
|
||||
// not relevant for now though, since [§] doesn't support mixed layouts yet
|
||||
auto inOrder = inBlockingDesc->getOrder();
|
||||
size_t startOff = outOrder.size() != outBlockingDesc_maxRank->getShape().getRank() &&
|
||||
outOrder.back() != inOrder.back() ? 1 : 0;
|
||||
for (size_t j = 0; j < rank; j++) {
|
||||
dims_in[i][dims_in[i].size() - 1 - j - startOff] = inBlockingDesc->getBlockDims()[rank - 1 - j];
|
||||
}
|
||||
}
|
||||
|
||||
// assume all output sizes are even
|
||||
const size_t outputNum = config.outConfs.size();
|
||||
|
||||
dims_out.resize(outputNum);
|
||||
for (size_t i = 0; i < outputNum; i++) {
|
||||
dims_out[i].resize(tensorRank, 1);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < outputNum; i++) {
|
||||
auto outBlockingDesc = getChildEdgeAt(i)->getMemory().GetDescWithType<BlockedMemoryDesc>();
|
||||
size_t rank = outBlockingDesc->getBlockDims().size();
|
||||
|
||||
for (size_t j = 0; j < rank; j++) {
|
||||
dims_out[i][dims_out[i].size() - 1 - j] = outBlockingDesc->getBlockDims()[rank - 1 - j];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto initOffsets = [this, config, dataSize](size_t tensorRank) {
|
||||
auto initOffsets = [this, config, dataSize]() {
|
||||
// find max rank input among all outputs
|
||||
const size_t inputNum = getParentEdges().size();
|
||||
offsets_in.resize(inputNum);
|
||||
for (size_t i = 0; i < inputNum; i++) {
|
||||
offsets_in[i].resize(tensorRank, 1);
|
||||
offset_calculation(offsets_in[i], dims_in[i], dims_out[max_rank_out_desc_idx]);
|
||||
offset_calculation(offsets_in[i], dims_in[i], exec_domain);
|
||||
for (size_t j = 0; j < tensorRank; j++) {
|
||||
offsets_in[i][j] *= dataSize;
|
||||
}
|
||||
@@ -352,7 +323,7 @@ void MKLDNNSnippetNode::define_schedule() {
|
||||
offsets_out.resize(outputNum);
|
||||
for (size_t i = 0; i < outputNum; i++) {
|
||||
offsets_out[i].resize(tensorRank, 1);
|
||||
offset_calculation(offsets_out[i], dims_out[i], dims_out[max_rank_out_desc_idx]);
|
||||
offset_calculation(offsets_out[i], dims_out[i], exec_domain);
|
||||
for (size_t j = 0; j < tensorRank; j++) {
|
||||
offsets_out[i][j] *= dataSize;
|
||||
}
|
||||
@@ -367,13 +338,13 @@ void MKLDNNSnippetNode::define_schedule() {
|
||||
}
|
||||
};
|
||||
|
||||
auto find_dims_to_collapse = [this, config, &outBlockingDesc_maxRank]() -> int {
|
||||
auto find_dims_to_collapse = [this, config]() -> int {
|
||||
int collapsedDims = 0;
|
||||
size_t minimalConcurrency = parallel_get_max_threads();
|
||||
size_t minimalJitWorkAmount = 256;
|
||||
size_t currentJitWorkAmount = dims_out[max_rank_out_desc_idx].back();
|
||||
size_t currentJitWorkAmount = exec_domain.back();
|
||||
while (currentJitWorkAmount < minimalJitWorkAmount && currentJitWorkAmount < fullWorkAmount) {
|
||||
if (static_cast<int>(dims_out[max_rank_out_desc_idx].size()) - collapsedDims - 2 < 0)
|
||||
if (static_cast<int>(exec_domain.size()) - collapsedDims - 2 < 0)
|
||||
break;
|
||||
|
||||
bool canCollapse = true;
|
||||
@@ -385,7 +356,7 @@ void MKLDNNSnippetNode::define_schedule() {
|
||||
}
|
||||
}
|
||||
|
||||
size_t nextJitWorkAmount = currentJitWorkAmount * dims_out[max_rank_out_desc_idx][dims_out[max_rank_out_desc_idx].size() - 2];
|
||||
size_t nextJitWorkAmount = currentJitWorkAmount * exec_domain[exec_domain.size() - 2];
|
||||
if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) {
|
||||
currentJitWorkAmount = nextJitWorkAmount;
|
||||
// if we cannot use dim collapsing we should use tile2D
|
||||
@@ -399,13 +370,13 @@ void MKLDNNSnippetNode::define_schedule() {
|
||||
}
|
||||
|
||||
collapsedDims++;
|
||||
for (size_t i = 0; i < dims_in.size(); i++) {
|
||||
collapseLastDims(dims_in[i], 1);
|
||||
}
|
||||
for (auto &d : dims_in)
|
||||
collapseLastDims(d, 1);
|
||||
|
||||
for (size_t i = 0; i < dims_out.size(); i++) {
|
||||
collapseLastDims(dims_out[i], 1);
|
||||
}
|
||||
for (auto &d : dims_out)
|
||||
collapseLastDims(d, 1);
|
||||
|
||||
collapseLastDims(exec_domain, 1);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
@@ -413,23 +384,23 @@ void MKLDNNSnippetNode::define_schedule() {
|
||||
return collapsedDims;
|
||||
};
|
||||
|
||||
auto initSchedulingInfo = [this, dataSize](const size_t tensorRank) -> void {
|
||||
auto initSchedulingInfo = [this, dataSize]() -> void {
|
||||
// initialize scheduling information
|
||||
sch_offsets_in.resize(offsets_in.size(), 0);
|
||||
sch_offsets_out.resize(offsets_out.size(), 0);
|
||||
sch_dims.resize(maxTileRank, 1);
|
||||
sch_dims[maxTileRank-1] = dims_out[max_rank_out_desc_idx].back();
|
||||
schedulerWorkAmount = fullWorkAmount / dims_out[max_rank_out_desc_idx].back();
|
||||
sch_dims[maxTileRank-1] = exec_domain.back();
|
||||
schedulerWorkAmount = fullWorkAmount / exec_domain.back();
|
||||
if (tileRank > 1) {
|
||||
sch_dims[maxTileRank - tileRank] = dims_out[max_rank_out_desc_idx][tensorRank - 2];
|
||||
schedulerWorkAmount /= dims_out[max_rank_out_desc_idx][tensorRank - 2];
|
||||
dims_out[max_rank_out_desc_idx][tensorRank - 2] = 1;
|
||||
sch_dims[maxTileRank - tileRank] = exec_domain[tensorRank - 2];
|
||||
schedulerWorkAmount /= exec_domain[tensorRank - 2];
|
||||
exec_domain[tensorRank - 2] = 1;
|
||||
|
||||
// update offsets for tile 2D because loaders have ptr shifts in some cases and stores always have ptr shifts
|
||||
for (size_t i = 0; i < offsets_in.size(); i++) {
|
||||
int64_t offset = offsets_in[i][tensorRank - 2];
|
||||
if ((offset > dataSize) || (offset == 0 && dims_in[i].back() != 1)) {
|
||||
sch_offsets_in[i] = offset - dims_out[max_rank_out_desc_idx].back() * dataSize;
|
||||
sch_offsets_in[i] = offset - exec_domain.back() * dataSize;
|
||||
} else if (offset == dataSize) {
|
||||
sch_offsets_in[i] = offset;
|
||||
}
|
||||
@@ -437,49 +408,27 @@ void MKLDNNSnippetNode::define_schedule() {
|
||||
|
||||
for (size_t i = 0; i < offsets_out.size(); i++) {
|
||||
int64_t offset = offsets_out[i][tensorRank - 2];
|
||||
sch_offsets_out[i] = offset - dims_out[max_rank_out_desc_idx].back() * dataSize;
|
||||
sch_offsets_out[i] = offset - exec_domain.back() * dataSize;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
initDims(tensorRank);
|
||||
|
||||
fullWorkAmount = 1;
|
||||
for (size_t i = 0; i < dims_out[max_rank_out_desc_idx].size(); i++) {
|
||||
fullWorkAmount *= dims_out[max_rank_out_desc_idx][i];
|
||||
for (const auto &d : exec_domain) {
|
||||
fullWorkAmount *= d;
|
||||
}
|
||||
|
||||
const int collapsedDims = find_dims_to_collapse();
|
||||
batchDimIdx = tensorRank - outBlockingDesc_maxRank->getBlockDims().size() + collapsedDims;
|
||||
batchDimIdx = tensorRank - exec_domain.size();
|
||||
// Note that exec_domain can be modified inside find_dims_to_collapse() and/or initSchedulingInfo()
|
||||
find_dims_to_collapse();
|
||||
|
||||
initOffsets(tensorRank);
|
||||
initSchedulingInfo(tensorRank);
|
||||
initOffsets();
|
||||
initSchedulingInfo();
|
||||
}
|
||||
|
||||
void MKLDNNSnippetNode::generate() {
|
||||
std::vector<MKLDNNEdgePtr> input_first_row;
|
||||
for (size_t i = 0; i < inputShapes.size(); i++)
|
||||
input_first_row.push_back(getParentEdgesAtPort(i)[0]);
|
||||
|
||||
auto edgeToBlockedShape = [](const MKLDNNEdgePtr& edge) -> ngraph::snippets::op::Subgraph::BlockedShape {
|
||||
const auto blockedDesc = edge->getMemory().GetDescWithType<BlockedMemoryDesc>();
|
||||
ngraph::Shape shape(blockedDesc->getBlockDims());
|
||||
ngraph::AxisVector blocking(blockedDesc->getOrder());
|
||||
ngraph::element::Type precision = InferenceEngine::details::convertPrecision(blockedDesc->getPrecision());
|
||||
return std::make_tuple(shape, blocking, precision);
|
||||
};
|
||||
ngraph::snippets::op::Subgraph::BlockedShapeVector input_blocked_shapes;
|
||||
std::transform(input_first_row.begin(), input_first_row.end(), std::back_inserter(input_blocked_shapes), edgeToBlockedShape);
|
||||
|
||||
std::vector<MKLDNNEdgePtr> output_first_row;
|
||||
for (size_t i = 0; i < outputShapes.size(); i++)
|
||||
// Can it go with different shape or precision to different edges? I assume no.
|
||||
output_first_row.push_back(getChildEdgesAtPort(i)[0]);
|
||||
|
||||
ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes;
|
||||
std::transform(output_first_row.begin(), output_first_row.end(), std::back_inserter(output_blocked_shapes), edgeToBlockedShape);
|
||||
jit_snippets_compile_args jcp;
|
||||
jcp.output_dims = dims_out[max_rank_out_desc_idx];
|
||||
jcp.output_dims = exec_domain;
|
||||
std::copy(sch_dims.begin(), sch_dims.end(), jcp.scheduler_dims);
|
||||
std::copy(sch_offsets_in.begin(), sch_offsets_in.end(), jcp.scheduler_offsets);
|
||||
std::copy(sch_offsets_out.begin(), sch_offsets_out.end(), &jcp.scheduler_offsets[sch_offsets_in.size()]);
|
||||
@@ -496,11 +445,11 @@ void MKLDNNSnippetNode::generate() {
|
||||
auto b = offsets_out[i].begin();
|
||||
std::copy(b, b + harness_num_dims, &jcp.data_offsets[(inputShapes.size() + i) * harness_num_dims]);
|
||||
}
|
||||
schedule = snippet->generate(output_blocked_shapes, input_blocked_shapes, reinterpret_cast<void*>(&jcp));
|
||||
schedule = snippet->generate(reinterpret_cast<void*>(&jcp));
|
||||
}
|
||||
|
||||
void MKLDNNSnippetNode::schedule_6d(const jit_snippets_call_args& call_args) const {
|
||||
const auto& dom = dims_out[max_rank_out_desc_idx];
|
||||
const auto& dom = exec_domain;
|
||||
// < N, C, H, W > < 1, 1, N, C*H*W>
|
||||
parallel_for5d(dom[0], dom[1], dom[2], dom[3], dom[4],
|
||||
[&](int64_t d0, int64_t d1, int64_t d2, int64_t d3, int64_t d4) {
|
||||
@@ -510,7 +459,7 @@ void MKLDNNSnippetNode::schedule_6d(const jit_snippets_call_args& call_args) con
|
||||
}
|
||||
|
||||
void MKLDNNSnippetNode::schedule_nt(const jit_snippets_call_args& call_args) const {
|
||||
const auto& work_size = dims_out[max_rank_out_desc_idx];
|
||||
const auto& work_size = exec_domain;
|
||||
parallel_nt(0, [&](const int ithr, const int nthr) {
|
||||
size_t start = 0, end = 0;
|
||||
splitter(schedulerWorkAmount, nthr, ithr, start, end);
|
||||
|
||||
@@ -61,7 +61,7 @@ private:
|
||||
|
||||
// Holds the index of the output used as the execution domain
|
||||
// it should be compatible with a schedule's work size
|
||||
size_t max_rank_out_desc_idx = 0;
|
||||
std::vector<size_t> exec_domain = {};
|
||||
|
||||
/// scheduling info
|
||||
size_t batchDimIdx = 0;
|
||||
@@ -74,13 +74,13 @@ private:
|
||||
std::vector<MKLDNNMemoryPtr> srcMemPtrs = {};
|
||||
std::vector<MKLDNNMemoryPtr> dstMemPtrs = {};
|
||||
|
||||
std::vector<std::vector<int64_t>> dims_in = {};
|
||||
std::vector<std::vector<int64_t>> offsets_in = {};
|
||||
std::vector<std::vector<size_t>> dims_in = {};
|
||||
std::vector<std::vector<size_t>> offsets_in = {};
|
||||
std::vector<ptrdiff_t> start_offset_in = {};
|
||||
std::vector<ptrdiff_t> start_offset_out = {};
|
||||
|
||||
std::vector<std::vector<int64_t>> dims_out = {};
|
||||
std::vector<std::vector<int64_t>> offsets_out = {};
|
||||
std::vector<std::vector<size_t>> dims_out = {};
|
||||
std::vector<std::vector<size_t>> offsets_out = {};
|
||||
|
||||
std::vector<int64_t> sch_dims = {};
|
||||
std::vector<int64_t> sch_offsets_in = {};
|
||||
|
||||
Reference in New Issue
Block a user