[CPU] Snippets throughput mode fixes (#9488)

This commit is contained in:
Ivan Novoselov
2022-02-08 17:58:42 +03:00
committed by GitHub
parent dfc738b493
commit b47b8ad4bf
11 changed files with 330 additions and 266 deletions

View File

@@ -16,29 +16,24 @@ namespace op {
* @brief Generated by Canonicalization for a scalar constant Shape() == {1}
* @ingroup snippets
*/
class Scalar : public ngraph::op::Constant {
class Scalar : public ov::op::v0::Constant {
public:
OPENVINO_OP("Scalar", "SnippetsOpset", ngraph::op::Constant);
OPENVINO_OP("Scalar", "SnippetsOpset", ov::op::v0::Constant);
Scalar() = default;
Scalar(const std::shared_ptr<runtime::Tensor>& tensor) : Constant(tensor) {}
template <typename T>
Scalar(const element::Type& type, Shape shape, const std::vector<T>& values) : Constant(type, shape, values) {}
Scalar(const element::Type& type, const Shape& shape) : Constant(type, shape) {}
template <class T, class = typename std::enable_if<std::is_fundamental<T>::value>::type>
Scalar(const element::Type& type, Shape shape, T value) : Constant(type, shape, value) {}
Scalar(const element::Type& type, Shape shape, const std::vector<std::string>& values) : Constant(type, shape, values) {}
Scalar(const element::Type& type, const Shape& shape, const void* data) : Constant(type, shape, data) {}
Scalar(const Constant& other) : Constant(other) {}
Scalar(const Scalar& other) : Constant(other) {}
Scalar& operator=(const Scalar&) = delete;
~Scalar() override {}
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
check_new_args_count(this, new_args);
return std::make_shared<Scalar>(*this);
Scalar(const element::Type& type, Shape shape, T value) : Constant(type, shape, value) {
constructor_validate_and_infer_types();
}
explicit Scalar(const Constant& other) : Constant(other) {
constructor_validate_and_infer_types();
}
Scalar(const Scalar& other) : Constant(other) {
constructor_validate_and_infer_types();
}
Scalar& operator=(const Scalar&) = delete;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
void validate_and_infer_types() override;
};
} // namespace op

View File

@@ -88,14 +88,14 @@ public:
return m_generator;
}
std::shared_ptr<Subgraph> make_canonical_from_this();
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes,
ngraph::pass::Manager opt = ngraph::pass::Manager(), const void* compile_params = nullptr);
ngraph::pass::Manager& opt, const void* compile_params = nullptr);
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes,
const void* compile_params = nullptr);
/// Set a new body for the op; body needs to satisfy requirements on inputs/outputs
void set_body(std::shared_ptr<ov::Model> body);
snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr);
snippets::Schedule generate(const void* compile_params = nullptr);
Shape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes);
// plugin sets generator for a snippet to some specific generator.
// it's going to be replaced with Jitters table later
@@ -109,9 +109,8 @@ public:
static auto wrap_node_as_subgraph(const std::shared_ptr<ngraph::Node>& node) -> std::shared_ptr<Subgraph>;
private:
void canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes);
void convert_to_snippet_dialect();
Shape exec_domain;
std::shared_ptr<ov::Model> m_body;
std::shared_ptr<ngraph::snippets::Generator> m_generator;
};

View File

@@ -0,0 +1,27 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>
namespace ngraph {
namespace snippets {
namespace pass {
/**
 * @interface ConvertConstantsToScalars
 * @brief Replace constants that should be represented as scalars during code generation.
 * Only single-value (0D) constants are currently supported.
 * @ingroup snippets
 */
class ConvertConstantsToScalars: public ngraph::pass::MatcherPass {
public:
ConvertConstantsToScalars();
};
} // namespace pass
} // namespace snippets
} // namespace ngraph

View File

@@ -0,0 +1,26 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>
namespace ngraph {
namespace snippets {
namespace pass {
/**
 * @interface ConvertPowerToPowerStatic
 * @brief Replace Power with a scalar input with snippets::op::PowerStatic for generation of a more optimal code.
 * @ingroup snippets
 */
class ConvertPowerToPowerStatic: public ngraph::pass::MatcherPass {
public:
ConvertPowerToPowerStatic();
};
} // namespace pass
} // namespace snippets
} // namespace ngraph

View File

@@ -0,0 +1,22 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/op/scalar.hpp"
using namespace ngraph;
// Scalar has no inputs, so cloning is a plain copy; check_new_args_count enforces that new_args is empty.
std::shared_ptr<Node> snippets::op::Scalar::clone_with_new_inputs(const OutputVector& new_args) const {
check_new_args_count(this, new_args);
return std::make_shared<Scalar>(*this);
}
// Scalar currently supports only one-element constants, this could be changed in the future
void snippets::op::Scalar::validate_and_infer_types() {
// Let the base Constant infer the output type/shape first, then constrain it.
Constant::validate_and_infer_types();
auto out_pshape = get_output_partial_shape(0);
NODE_VALIDATION_CHECK(this, out_pshape.is_static(), "Scalar supports only static input shapes");
// Accept either a true 0D scalar (empty shape) or any shape with exactly one element, e.g. {1} or {1,1}.
NODE_VALIDATION_CHECK(this, out_pshape.get_shape().empty() || ov::shape_size(out_pshape.get_shape()) == 1,
"Scalar supports only one-element constants, got ", out_pshape.get_shape(),
" shape");
}

View File

@@ -10,6 +10,8 @@
#include "snippets/pass/insert_movebroadcast.hpp"
#include "snippets/pass/load_movebroadcast_to_broadcastload.hpp"
#include "snippets/pass/assign_registers.hpp"
#include "snippets/pass/convert_constants_to_scalars.hpp"
#include "snippets/pass/convert_power_to_powerstatic.hpp"
#include <ngraph/pass/manager.hpp>
#include <openvino/pass/serialize.hpp>
@@ -115,79 +117,102 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
return subgraph;
}
std::shared_ptr<snippets::op::Subgraph> snippets::op::Subgraph::make_canonical_from_this() {
INTERNAL_OP_SCOPE(Subgraph);
ngraph::OutputVector subgraph_node_inputs;
for (auto input : this->input_values()) {
subgraph_node_inputs.push_back(input);
}
auto new_body = ov::clone_model(*this->get_body().get());
auto snippet = std::make_shared<op::Subgraph>(subgraph_node_inputs, new_body);
ngraph::copy_runtime_info(this->shared_from_this(), snippet);
snippet->set_friendly_name(this->get_friendly_name());
snippet->set_generator(this->m_generator);
return snippet;
}
// We also can think of canonization as of pass to copy original subgraph and transforming it to canonical form suitable for code generation
// pass actual parameters and results shapes to generate for as well as channel mapping,
// Todo: we need to distinguish between 5d tensors that represents <N, C, H, W, c> and <N, C, D, H, W> somehow like locked dimensions
// ngraph::AxisVector to code
void snippets::op::Subgraph::canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes) {
///
/// \brief Canonization transforms original subgraph and to canonical form suitable for code generation. In particular,
/// it handles supported layout conversions, broadcasts inputs and outputs to a single rank and layout. Canonicalization
/// returns master-shape (max rank + max dimensions over all outputs) that can be used for scheduling.
/// Canonicalization currently supports only the following layout conversions:
/// * None: all inputs have the same layout
/// * Planar + blocked: some inputs have blocked, and some have planar layouts, e.g. <N, C, H, W, c> + <N, C, H, W>
Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) {
INTERNAL_OP_SCOPE(Subgraph);
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::canonicalize")
NODE_VALIDATION_CHECK(this, input_shapes.size() == m_body->get_parameters().size(),
"Number of parameters for snippet doesn't match passed to generate method: ", input_shapes.size(), " vs ", m_body->get_parameters().size(), ".");
NODE_VALIDATION_CHECK(this, inputShapes.size() == m_body->get_parameters().size(),
"Number of parameters for snippet doesn't match passed to generate method: ", inputShapes.size(), " vs ", m_body->get_parameters().size(), ".");
NODE_VALIDATION_CHECK(this, output_shapes.size() == m_body->get_results().size(),
"number of results for snippet doesn't match passed to generate method: ", output_shapes.size(), " vs ", m_body->get_results().size(), ".");
// replace only constants which are actually should be represented as scalars during code generation and probably move this step a bit later
for (auto op : m_body->get_ordered_ops()) {
if (auto constant = ngraph::as_type_ptr<opset1::Constant>(op)) {
auto scalar = std::make_shared<snippets::op::Scalar>(*constant);
scalar->set_friendly_name(constant->get_friendly_name());
ngraph::copy_runtime_info(constant, scalar);
ngraph::replace_node(constant, scalar);
}
NODE_VALIDATION_CHECK(this, outputShapes.size() == m_body->get_results().size(),
"number of results for snippet doesn't match passed to generate method: ", outputShapes.size(), " vs ", m_body->get_results().size(), ".");
// todo: is it allowed to have outputs with different layouts? I assume not; remove this check if that assumption is invalid
const AxisVector outOrder = get<1>(outputShapes[0]);
for (size_t i = 1; i < outputShapes.size(); i++) {
const AxisVector order_i = get<1>(outputShapes[i]);
NODE_VALIDATION_CHECK(this, outOrder.size() == order_i.size() && equal(outOrder.begin(), outOrder.end(), order_i.begin()),
"Snippets output shapes must have the same layout");
}
// it should be in subgraph node to be aligned with internal and external parameter list, but adding this for testing
// TODO: store blocking into to Parameter's rt_info for future propagation
for (size_t i = 0; i < m_body->get_parameters().size(); i++) {
auto param = m_body->get_parameters()[i];
if (param->get_shape().size() < 4) {
std::vector<size_t> shape(4, 1);
std::copy(param->get_shape().begin(), param->get_shape().end(), &shape.at(4 - (param->get_shape().size() == 0 ? 1 : param->get_shape().size())) );
m_body->replace_parameter(i, std::make_shared<opset1::Parameter>(param->get_element_type(), ngraph::Shape(shape)));
} else if (param->get_shape().size() >= 4) {
if (param->get_element_type() != std::get<2>(input_shapes[i])) {
throw ngraph::ngraph_error("changes in presision. Is it legal??");
auto getMaxRankBlockedShape = [](const BlockedShapeVector& blockedShapes) -> const BlockedShape& {
return *std::max_element(blockedShapes.begin(), blockedShapes.end(),
[&](const BlockedShape& lhs, const BlockedShape& rhs) {
return std::get<0>(lhs).size() < std::get<0>(rhs).size();
});
};
Shape baseShape;
AxisVector baseOrder;
std::tie(baseShape, baseOrder, std::ignore) = getMaxRankBlockedShape(inputShapes);
const auto baseRank = baseShape.size();
const bool baseIsBlocked = baseOrder.size() != std::set<size_t>(baseOrder.begin(), baseOrder.end()).size();
for (size_t i = 0; i < inputShapes.size(); i++) {
const auto &blockedShape = inputShapes[i];
Shape inShape;
AxisVector inOrder;
element::Type inType;
std::tie(inShape, inOrder, inType) = blockedShape;
const auto inRank = inShape.size();
NODE_VALIDATION_CHECK(this, inRank <= baseRank, "Input rank can't be larger than output rank in snippets.");
if (inRank < baseRank) {
Shape newShape(baseRank, 1);
// todo: more complicated logics is needed if we want to merge smth else than blocked and planar
// could be done by PartialShape::broadcast_merge_into, but this way is faster
size_t startOffset = baseRank - inRank;
if (baseIsBlocked) {
const bool inIsNotBlocked = inOrder.size() == std::set<size_t>(inOrder.begin(), inOrder.end()).size();
NODE_VALIDATION_CHECK(this, inIsNotBlocked, "Snippets don't support conversion between blocked layouts of different ranks");
startOffset--;
}
m_body->replace_parameter(i, std::make_shared<opset1::Parameter>(std::get<2>(input_shapes[i]), std::get<0>(input_shapes[i])));
std::copy(inShape.begin(), inShape.end(), &newShape[startOffset]);
inShape = move(newShape);
} else {
// todo: 4d blocked + 5d planar layouts are not supported: <N, C, H, W, c> + <N, C, D, H, W>
NODE_VALIDATION_CHECK(this,
equal(baseOrder.begin(), baseOrder.end(), inOrder.begin()),
"Snippets canonicalization got input shapes of equal ranks but different layouts, which is not supported");
}
ov::PartialShape tmpPShape(baseShape);
NODE_VALIDATION_CHECK(this,
PartialShape::broadcast_merge_into(tmpPShape, inShape, ::ngraph::op::AutoBroadcastType::NUMPY),
"Failed to create broadcastable shapes in snippets canonicalization");
const auto paramShape = m_body->get_parameters()[i]->get_shape();
if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin()))
m_body->replace_parameter(i, std::make_shared<opset1::Parameter>(inType, inShape));
}
m_body->validate_nodes_and_infer_types();
for (size_t i = 0; i < m_body->get_results().size(); i++) {
auto result = m_body->get_results()[i];
PartialShape partial(result->get_shape());
bool isCompatible = ngraph::PartialShape::broadcast_merge_into(partial, std::get<0>(output_shapes[i]), ::ngraph::op::AutoBroadcastType::NUMPY);
// equality check won't pass since we reshape without changes on external snippet edges
NODE_VALIDATION_CHECK(this, isCompatible, "Inferend and passed results shapes are difference for snippet : ",
result->get_shape(), " vs ", std::get<0>(output_shapes[i]), ".");
// Check that output shapes are broadcastable => can be scheduled
const auto& body_results = m_body->get_results();
PartialShape outPShape = body_results[0]->get_shape();
for (size_t i = 0; i < body_results.size(); i++) {
auto shape_i = body_results[i]->get_shape();
PartialShape pShape_i(shape_i);
// Check that the produced output shape corresponds to the passed shape
bool compatibleWithPassedShape = PartialShape::broadcast_merge_into(pShape_i, std::get<0>(outputShapes[i]),
::ngraph::op::AutoBroadcastType::NUMPY);
NODE_VALIDATION_CHECK(this, compatibleWithPassedShape, "Inferred and passed results shapes are difference for snippet : ",
shape_i, " vs ", std::get<0>(outputShapes[i]), ".");
// Check that output shapes are broadcastable to each other => can be scheduled
bool compatibleWithOtherOutputs = PartialShape::broadcast_merge_into(outPShape, shape_i,
::ngraph::op::AutoBroadcastType::NUMPY);
NODE_VALIDATION_CHECK(this, compatibleWithOtherOutputs, "Snippets output shapes must be numpy broadcastable");
}
exec_domain = outPShape.get_shape();
return exec_domain;
}
void snippets::op::Subgraph::convert_to_snippet_dialect() {
INTERNAL_OP_SCOPE(Subgraph);
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::convert_to_snippet_dialect")
ngraph::pass::Manager manager;
manager.register_pass<snippets::pass::ConvertConstantsToScalars>();
manager.register_pass<snippets::pass::ConvertPowerToPowerStatic>();
manager.register_pass<snippets::pass::InsertLoad>();
manager.register_pass<snippets::pass::InsertStore>();
manager.register_pass<snippets::pass::InsertMoveBroadcast>();
@@ -198,39 +223,27 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() {
snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& output_shapes,
const BlockedShapeVector& input_shapes,
const void* compile_params) {
return generate(output_shapes, input_shapes, ngraph::pass::Manager(), compile_params);
canonicalize(output_shapes, input_shapes);
return generate(compile_params);
}
snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& output_shapes,
const BlockedShapeVector& input_shapes,
ngraph::pass::Manager opt,
ngraph::pass::Manager& opt,
const void* compile_params) {
canonicalize(output_shapes, input_shapes);
return generate(opt, compile_params);
}
snippets::Schedule snippets::op::Subgraph::generate(const void* compile_params) {
auto mngr = ngraph::pass::Manager();
return generate(mngr, compile_params);
}
snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, const void* compile_params) {
INTERNAL_OP_SCOPE(Subgraph);
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::generate")
NGRAPH_CHECK(m_generator != nullptr, "generate is called while generator is not set");
canonicalize(output_shapes, input_shapes);
// Todo: ngraph::pass::Manager introduces appreciable overheads, especially while used on small graphs.
// So don't wrap this transformation as a MatcherPass, but rewrite convert_to_snippet_dialect() as a
// for loop to improve first-inference time.
// replace power with power static
for (auto op : m_body->get_ordered_ops()) {
if (ov::is_type<opset1::Power>(op) &&
ov::is_type<snippets::op::Scalar>(op->get_input_node_shared_ptr(1)) &&
ov::shape_size(op->get_input_shape(1)) == 1) {
auto power = ov::as_type_ptr<opset1::Power>(op);
auto scalar = ov::as_type_ptr<snippets::op::Scalar>(op->get_input_node_shared_ptr(1));
auto value = scalar->cast_vector<float>()[0];;
auto power_static = std::make_shared<snippets::op::PowerStatic>(power->input(0).get_source_output(), value);
power_static->set_friendly_name(power->get_friendly_name());
ngraph::copy_runtime_info(power, power_static);
ngraph::replace_node(power, power_static);
}
}
convert_to_snippet_dialect();
opt.run_passes(m_body);
@@ -253,27 +266,7 @@ snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& ou
}
NGRAPH_CHECK(!constants.size(), "External constants detected. Snippet is illigal for scheduling");
// check resulting shapes are broadcastable to each other so can be scheduled
Shape work_size = m_body->output(0).get_shape();
for (size_t k = 0; k < m_body->get_output_size(); k++) {
auto shape = m_body->output(k).get_shape();
if (work_size.size() != shape.size()) {
throw ngraph_error("rank for all outputs of a snippet should match");
}
for (size_t i = 0; i < work_size.size(); i++) {
if (work_size[i] != shape[i]) {
if (work_size[i] == 1 || shape[i] == 1) {
work_size[i] = max(work_size[i], shape[i]);
} else {
throw ngraph_error("incompatible shapes for output graphs");
}
}
}
}
return {work_size, false /*canBeLinearized*/, ptr};
return {exec_domain, false /*canBeLinearized*/, ptr};
}
void snippets::op::Subgraph::print() const {

View File

@@ -0,0 +1,28 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <snippets/itt.hpp>
#include "snippets/snippets_isa.hpp"
#include "snippets/pass/convert_constants_to_scalars.hpp"
#include <ngraph/rt_info.hpp>
// Replaces one-element Constants with snippets::op::Scalar so they can be emitted as scalars.
ngraph::snippets::pass::ConvertConstantsToScalars::ConvertConstantsToScalars() {
    MATCHER_SCOPE(ConvertConstantsToScalars);
    // Match only one-element Constants: snippets::op::Scalar::validate_and_infer_types()
    // rejects multi-element constants, so converting such a Constant would throw inside
    // the callback instead of simply leaving the node untouched.
    auto constants = std::make_shared<pattern::op::Label>(pattern::any_input(),
                                                          [](std::shared_ptr<Node> n) {
                                                              return ngraph::is_type<ov::op::v0::Constant>(n) &&
                                                                     ov::shape_size(n->get_output_shape(0)) == 1;
                                                          });
    ngraph::graph_rewrite_callback callback = [this](ngraph::pattern::Matcher &m) {
        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ConvertConstantsToScalars")
        auto constant = as_type_ptr<ov::op::v0::Constant>(m.get_match_root());
        // Build an equivalent Scalar and transfer the friendly name and rt_info before swapping it in.
        auto scalar = std::make_shared<snippets::op::Scalar>(*constant);
        scalar->set_friendly_name(constant->get_friendly_name());
        ngraph::copy_runtime_info(constant, scalar);
        ngraph::replace_node(constant, scalar);
        return true;
    };
    register_matcher(std::make_shared<ov::pass::pattern::Matcher>(constants), callback);
}

View File

@@ -0,0 +1,31 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <snippets/itt.hpp>
#include "snippets/snippets_isa.hpp"
#include "snippets/pass/convert_power_to_powerstatic.hpp"
#include <ngraph/rt_info.hpp>
// Replaces Power(x, Scalar) with snippets::op::PowerStatic(x) carrying the exponent as an attribute,
// which allows the code generator to emit a more optimal sequence.
ngraph::snippets::pass::ConvertPowerToPowerStatic::ConvertPowerToPowerStatic() {
    MATCHER_SCOPE(ConvertPowerToPowerStatic);
    // Match Power whose exponent input is already a snippets::op::Scalar
    // (Scalar is guaranteed to hold exactly one element by its validate_and_infer_types()).
    auto scalarPower = std::make_shared<pattern::op::Label>(pattern::any_input(),
                                                            [](std::shared_ptr<Node> n) {
                                                                return is_type<ov::op::v1::Power>(n) &&
                                                                       is_type<snippets::op::Scalar>(n->get_input_node_shared_ptr(1));
                                                            });
    ngraph::graph_rewrite_callback callback = [this](ngraph::pattern::Matcher &m) {
        // Note: the task name previously said "ConvertConstantsToScalars" (copy-paste);
        // it must reference this pass so ITT profiles attribute time correctly.
        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ConvertPowerToPowerStatic")
        auto power = ov::as_type_ptr<ov::op::v1::Power>(m.get_match_root());
        auto scalar = ov::as_type_ptr<snippets::op::Scalar>(power->get_input_node_shared_ptr(1));
        // Safe: Scalar holds exactly one element, so index 0 always exists.
        auto value = scalar->cast_vector<float>()[0];
        auto power_static = std::make_shared<snippets::op::PowerStatic>(power->input(0).get_source_output(), value);
        power_static->set_friendly_name(power->get_friendly_name());
        ngraph::copy_runtime_info(power, power_static);
        ngraph::replace_node(power, power_static);
        return true;
    };
    register_matcher(std::make_shared<ov::pass::pattern::Matcher>(scalarPower), callback);
}

View File

@@ -24,7 +24,7 @@ struct jit_snippets_compile_args {
int64_t scheduler_dims[SNIPPETS_MAX_TILE_RANK] = {};
int64_t scheduler_offsets[SNIPPETS_MAX_SNIPPETS_DIMS] = {};
int64_t data_offsets[SNIPPETS_MAX_SNIPPETS_DIMS * SNIPPETS_MAX_HARNESS_DIMS] = {};
std::vector<int64_t> output_dims = {};
std::vector<size_t> output_dims = {};
};
///
/// \brief Kernel is the only entry point to Codogen Jit compilation. Kernel calculates appropriate data offsets,
@@ -361,13 +361,7 @@ class ScalarEmitter : public jit_emitter {
public:
ScalarEmitter(mkldnn::impl::cpu::x64::jit_generator* h, mkldnn::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
: jit_emitter(h, isa, n) {
auto out_pshape = n->output(0).get_tensor().get_partial_shape();
if (out_pshape.is_dynamic())
IE_THROW() << "ScalarEmitter supports only static input shapes";
if ( out_pshape.get_shape() != ov::Shape() && ov::shape_size(out_pshape.get_shape()) != 1)
IE_THROW() << "ScalarEmitter got invalid shape";
value = mkldnn::impl::cpu::x64::float2int(ov::as_type_ptr<ngraph::snippets::op::Scalar>(n)->cast_vector<float>()[0]);
push_arg_entry_of("scalar", value, true);
prepare_table();
}

View File

@@ -58,25 +58,6 @@ void MKLDNNSnippetNode::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;
auto hasBroadcastByC = [this]() -> bool {
for (auto op : ngraph::as_type_ptr<ngraph::snippets::op::Subgraph>(snippet)->get_body()->get_ops()) {
if (ngraph::op::supports_auto_broadcast(op)) {
auto shape = op->get_input_shape(0);
// Filter out scalar empty shape Shape{}
if (ngraph::shape_size(shape) != 1) {
for (const auto& input : op->inputs()) {
if (input.get_shape().size() > 1 && shape[1] != input.get_shape()[1] && ngraph::shape_size(input.get_shape()) != 1) {
return true;
}
}
} else {
return false;
}
}
}
return false;
};
const Precision supportedPrecision = Precision::FP32;
bool dimRanksAreEqual = true;
@@ -90,9 +71,9 @@ void MKLDNNSnippetNode::initSupportedPrimitiveDescriptors() {
const size_t ndims = outputShapes[0].getRank();
const bool isChannelsFirstApplicable = dnnl::impl::utils::one_of(ndims, 1, 2, 4, 5) && dimRanksAreEqual;
// Todo: Snippets currently don't support per-channel broadcasting of Blocked descriptors because
// canonicalization can't distinguish between <N, C, H, W, c> and <N, C, D, H, W> cases. So we need to pass an
// additional parameter to canonicalization, see snippets::op::Subgraph::canonicalize for details.
const bool isBlockedApplicable = dnnl::impl::utils::one_of(ndims, 4, 5) && dimRanksAreEqual && !hasBroadcastByC();
// canonicalization can't distinguish between <N, C, H, W, c> and <N, C, D, H, W> cases.
// See snippets::op::Subgraph::canonicalize for details.
const bool isBlockedApplicable = dnnl::impl::utils::one_of(ndims, 4, 5) && dimRanksAreEqual;
enum LayoutType {
Planar,
ChannelsFirst,
@@ -255,15 +236,17 @@ static size_t argmax_rank(const std::vector<MKLDNNEdgeWeakPtr> &childEdges) {
return max_rank_idx;
}
static auto offset_calculation(std::vector<int64_t>& offset, const std::vector<int64_t>& dims_in, const std::vector<int64_t>& dims_out) -> void {
int k = 1;
static void offset_calculation(std::vector<size_t>& offset, const std::vector<size_t>& dims_in, const std::vector<size_t>& dims_out) {
size_t k = 1;
for (int i = offset.size() - 1; i >= 0; i--) {
offset[i] = (dims_in[i] == dims_out[i]) ? k : 0;
k *= dims_in[i];
}
}
static auto collapseLastDims(std::vector<int64_t>& dims, int dimsToCollapse) -> void {
static auto collapseLastDims(std::vector<size_t>& dims, size_t dimsToCollapse) -> void {
if (dimsToCollapse >= dims.size() - 1)
IE_THROW() << "Got invalid number of dims to collapse. Expected < " << dims.size() - 1 << " got " << dimsToCollapse;
for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) {
dims[dims.size() - 1] *= dims[i];
}
@@ -278,63 +261,51 @@ static auto collapseLastDims(std::vector<int64_t>& dims, int dimsToCollapse) ->
}
void MKLDNNSnippetNode::define_schedule() {
auto edgeToBlockedShape = [](const MKLDNNEdgePtr& edge) {
const auto blockedDesc = edge->getMemory().GetDescWithType<BlockedMemoryDesc>();
ngraph::Shape shape(blockedDesc->getBlockDims());
ngraph::AxisVector blocking(blockedDesc->getOrder());
ngraph::element::Type precision = InferenceEngine::details::convertPrecision(blockedDesc->getPrecision());
return ngraph::snippets::op::Subgraph::BlockedShape{shape, blocking, precision};
};
auto prependWithOnes = [this](const std::vector<size_t>& dims) {
if (tensorRank <= dims.size())
return dims;
VectorDims result(tensorRank, 1);
std::copy(dims.begin(), dims.end(), &result[tensorRank - dims.size()]);
return result;
};
ngraph::snippets::op::Subgraph::BlockedShapeVector input_blocked_shapes;
for (size_t i = 0; i < inputShapes.size(); i++)
input_blocked_shapes.push_back(edgeToBlockedShape(getParentEdgesAtPort(i)[0]));
ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes;
for (size_t i = 0; i < outputShapes.size(); i++)
output_blocked_shapes.push_back(edgeToBlockedShape(getChildEdgesAtPort(i)[0]));
exec_domain = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes);
// initialize by maximum output dimension. Dimensions of outputs should be broadcastable
tensorRank = std::max(static_cast<size_t>(rank6D), exec_domain.size());
// Canonicalization broadcasts inputs and outputs to max input rank, which can be smaller than tensorRank
// prepend to enable 6D scheduler
exec_domain = prependWithOnes(exec_domain);
const auto &body = snippet->get_body();
for (const auto& p : body->get_parameters()) {
dims_in.emplace_back(prependWithOnes(p->get_shape()));
}
for (size_t i = 0; i < body->get_output_size(); i++) {
dims_out.push_back(prependWithOnes(body->get_output_shape(i)));
}
const auto config = getSelectedPrimitiveDescriptor()->getConfig();
const auto dataSize = config.inConfs[0].getMemDesc()->getPrecision().size();
// store to use as an execution domain
max_rank_out_desc_idx = argmax_rank(getChildEdges());
const auto outBlockingDesc_maxRank = getChildEdgeAt(max_rank_out_desc_idx)->getMemory().GetDescWithType<BlockedMemoryDesc>();
// initialize by maximum output dimension. Dimensions of outputs should be broadcastable
tensorRank = std::max(static_cast<size_t>(rank6D), outBlockingDesc_maxRank->getBlockDims().size());
auto initDims = [this, config, &outBlockingDesc_maxRank](size_t tensorRank) {
// assume all input sizes are even
const size_t inputNum = getParentEdges().size();
dims_in.resize(inputNum);
for (size_t i = 0; i < inputNum; i++) {
dims_in[i].resize(tensorRank, 1);
}
const auto outOrder = outBlockingDesc_maxRank->getOrder();
for (size_t i = 0; i < inputNum; i++) {
auto inBlockingDesc = getParentEdgeAt(i)->getMemory().GetDescWithType<BlockedMemoryDesc>();
size_t rank = inBlockingDesc->getBlockDims().size();
// WA to normalize blocked and planar layouts
// not actual thought, since [§] doesn't support mixed layouts yet
auto inOrder = inBlockingDesc->getOrder();
size_t startOff = outOrder.size() != outBlockingDesc_maxRank->getShape().getRank() &&
outOrder.back() != inOrder.back() ? 1 : 0;
for (size_t j = 0; j < rank; j++) {
dims_in[i][dims_in[i].size() - 1 - j - startOff] = inBlockingDesc->getBlockDims()[rank - 1 - j];
}
}
// assume all output sizes are even
const size_t outputNum = config.outConfs.size();
dims_out.resize(outputNum);
for (size_t i = 0; i < outputNum; i++) {
dims_out[i].resize(tensorRank, 1);
}
for (size_t i = 0; i < outputNum; i++) {
auto outBlockingDesc = getChildEdgeAt(i)->getMemory().GetDescWithType<BlockedMemoryDesc>();
size_t rank = outBlockingDesc->getBlockDims().size();
for (size_t j = 0; j < rank; j++) {
dims_out[i][dims_out[i].size() - 1 - j] = outBlockingDesc->getBlockDims()[rank - 1 - j];
}
}
};
auto initOffsets = [this, config, dataSize](size_t tensorRank) {
auto initOffsets = [this, config, dataSize]() {
// find max rank input among all outputs
const size_t inputNum = getParentEdges().size();
offsets_in.resize(inputNum);
for (size_t i = 0; i < inputNum; i++) {
offsets_in[i].resize(tensorRank, 1);
offset_calculation(offsets_in[i], dims_in[i], dims_out[max_rank_out_desc_idx]);
offset_calculation(offsets_in[i], dims_in[i], exec_domain);
for (size_t j = 0; j < tensorRank; j++) {
offsets_in[i][j] *= dataSize;
}
@@ -352,7 +323,7 @@ void MKLDNNSnippetNode::define_schedule() {
offsets_out.resize(outputNum);
for (size_t i = 0; i < outputNum; i++) {
offsets_out[i].resize(tensorRank, 1);
offset_calculation(offsets_out[i], dims_out[i], dims_out[max_rank_out_desc_idx]);
offset_calculation(offsets_out[i], dims_out[i], exec_domain);
for (size_t j = 0; j < tensorRank; j++) {
offsets_out[i][j] *= dataSize;
}
@@ -367,13 +338,13 @@ void MKLDNNSnippetNode::define_schedule() {
}
};
auto find_dims_to_collapse = [this, config, &outBlockingDesc_maxRank]() -> int {
auto find_dims_to_collapse = [this, config]() -> int {
int collapsedDims = 0;
size_t minimalConcurrency = parallel_get_max_threads();
size_t minimalJitWorkAmount = 256;
size_t currentJitWorkAmount = dims_out[max_rank_out_desc_idx].back();
size_t currentJitWorkAmount = exec_domain.back();
while (currentJitWorkAmount < minimalJitWorkAmount && currentJitWorkAmount < fullWorkAmount) {
if (static_cast<int>(dims_out[max_rank_out_desc_idx].size()) - collapsedDims - 2 < 0)
if (static_cast<int>(exec_domain.size()) - collapsedDims - 2 < 0)
break;
bool canCollapse = true;
@@ -385,7 +356,7 @@ void MKLDNNSnippetNode::define_schedule() {
}
}
size_t nextJitWorkAmount = currentJitWorkAmount * dims_out[max_rank_out_desc_idx][dims_out[max_rank_out_desc_idx].size() - 2];
size_t nextJitWorkAmount = currentJitWorkAmount * exec_domain[exec_domain.size() - 2];
if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) {
currentJitWorkAmount = nextJitWorkAmount;
// if we cannot use dim collapsing we should use tile2D
@@ -399,13 +370,13 @@ void MKLDNNSnippetNode::define_schedule() {
}
collapsedDims++;
for (size_t i = 0; i < dims_in.size(); i++) {
collapseLastDims(dims_in[i], 1);
}
for (auto &d : dims_in)
collapseLastDims(d, 1);
for (size_t i = 0; i < dims_out.size(); i++) {
collapseLastDims(dims_out[i], 1);
}
for (auto &d : dims_out)
collapseLastDims(d, 1);
collapseLastDims(exec_domain, 1);
} else {
break;
}
@@ -413,23 +384,23 @@ void MKLDNNSnippetNode::define_schedule() {
return collapsedDims;
};
auto initSchedulingInfo = [this, dataSize](const size_t tensorRank) -> void {
auto initSchedulingInfo = [this, dataSize]() -> void {
// initialize scheduling information
sch_offsets_in.resize(offsets_in.size(), 0);
sch_offsets_out.resize(offsets_out.size(), 0);
sch_dims.resize(maxTileRank, 1);
sch_dims[maxTileRank-1] = dims_out[max_rank_out_desc_idx].back();
schedulerWorkAmount = fullWorkAmount / dims_out[max_rank_out_desc_idx].back();
sch_dims[maxTileRank-1] = exec_domain.back();
schedulerWorkAmount = fullWorkAmount / exec_domain.back();
if (tileRank > 1) {
sch_dims[maxTileRank - tileRank] = dims_out[max_rank_out_desc_idx][tensorRank - 2];
schedulerWorkAmount /= dims_out[max_rank_out_desc_idx][tensorRank - 2];
dims_out[max_rank_out_desc_idx][tensorRank - 2] = 1;
sch_dims[maxTileRank - tileRank] = exec_domain[tensorRank - 2];
schedulerWorkAmount /= exec_domain[tensorRank - 2];
exec_domain[tensorRank - 2] = 1;
// update offsets for tile 2D because loaders have ptr shifts in some cases and stores have always ptrs shifts
for (size_t i = 0; i < offsets_in.size(); i++) {
int64_t offset = offsets_in[i][tensorRank - 2];
if ((offset > dataSize) || (offset == 0 && dims_in[i].back() != 1)) {
sch_offsets_in[i] = offset - dims_out[max_rank_out_desc_idx].back() * dataSize;
sch_offsets_in[i] = offset - exec_domain.back() * dataSize;
} else if (offset == dataSize) {
sch_offsets_in[i] = offset;
}
@@ -437,49 +408,27 @@ void MKLDNNSnippetNode::define_schedule() {
for (size_t i = 0; i < offsets_out.size(); i++) {
int64_t offset = offsets_out[i][tensorRank - 2];
sch_offsets_out[i] = offset - dims_out[max_rank_out_desc_idx].back() * dataSize;
sch_offsets_out[i] = offset - exec_domain.back() * dataSize;
}
}
};
initDims(tensorRank);
fullWorkAmount = 1;
for (size_t i = 0; i < dims_out[max_rank_out_desc_idx].size(); i++) {
fullWorkAmount *= dims_out[max_rank_out_desc_idx][i];
for (const auto &d : exec_domain) {
fullWorkAmount *= d;
}
const int collapsedDims = find_dims_to_collapse();
batchDimIdx = tensorRank - outBlockingDesc_maxRank->getBlockDims().size() + collapsedDims;
batchDimIdx = tensorRank - exec_domain.size();
// Note that exec_domain can be modified inside find_dims_to_collapse() and/or initSchedulingInfo()
find_dims_to_collapse();
initOffsets(tensorRank);
initSchedulingInfo(tensorRank);
initOffsets();
initSchedulingInfo();
}
void MKLDNNSnippetNode::generate() {
std::vector<MKLDNNEdgePtr> input_first_row;
for (size_t i = 0; i < inputShapes.size(); i++)
input_first_row.push_back(getParentEdgesAtPort(i)[0]);
auto edgeToBlockedShape = [](const MKLDNNEdgePtr& edge) -> ngraph::snippets::op::Subgraph::BlockedShape {
const auto blockedDesc = edge->getMemory().GetDescWithType<BlockedMemoryDesc>();
ngraph::Shape shape(blockedDesc->getBlockDims());
ngraph::AxisVector blocking(blockedDesc->getOrder());
ngraph::element::Type precision = InferenceEngine::details::convertPrecision(blockedDesc->getPrecision());
return std::make_tuple(shape, blocking, precision);
};
ngraph::snippets::op::Subgraph::BlockedShapeVector input_blocked_shapes;
std::transform(input_first_row.begin(), input_first_row.end(), std::back_inserter(input_blocked_shapes), edgeToBlockedShape);
std::vector<MKLDNNEdgePtr> output_first_row;
for (size_t i = 0; i < outputShapes.size(); i++)
// Can it go with difference shape or precision to different edges? I assume no.
output_first_row.push_back(getChildEdgesAtPort(i)[0]);
ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes;
std::transform(output_first_row.begin(), output_first_row.end(), std::back_inserter(output_blocked_shapes), edgeToBlockedShape);
jit_snippets_compile_args jcp;
jcp.output_dims = dims_out[max_rank_out_desc_idx];
jcp.output_dims = exec_domain;
std::copy(sch_dims.begin(), sch_dims.end(), jcp.scheduler_dims);
std::copy(sch_offsets_in.begin(), sch_offsets_in.end(), jcp.scheduler_offsets);
std::copy(sch_offsets_out.begin(), sch_offsets_out.end(), &jcp.scheduler_offsets[sch_offsets_in.size()]);
@@ -496,11 +445,11 @@ void MKLDNNSnippetNode::generate() {
auto b = offsets_out[i].begin();
std::copy(b, b + harness_num_dims, &jcp.data_offsets[(inputShapes.size() + i) * harness_num_dims]);
}
schedule = snippet->generate(output_blocked_shapes, input_blocked_shapes, reinterpret_cast<void*>(&jcp));
schedule = snippet->generate(reinterpret_cast<void*>(&jcp));
}
void MKLDNNSnippetNode::schedule_6d(const jit_snippets_call_args& call_args) const {
const auto& dom = dims_out[max_rank_out_desc_idx];
const auto& dom = exec_domain;
// < N, C, H, W > < 1, 1, N, C*H*W>
parallel_for5d(dom[0], dom[1], dom[2], dom[3], dom[4],
[&](int64_t d0, int64_t d1, int64_t d2, int64_t d3, int64_t d4) {
@@ -510,7 +459,7 @@ void MKLDNNSnippetNode::schedule_6d(const jit_snippets_call_args& call_args) con
}
void MKLDNNSnippetNode::schedule_nt(const jit_snippets_call_args& call_args) const {
const auto& work_size = dims_out[max_rank_out_desc_idx];
const auto& work_size = exec_domain;
parallel_nt(0, [&](const int ithr, const int nthr) {
size_t start = 0, end = 0;
splitter(schedulerWorkAmount, nthr, ithr, start, end);

View File

@@ -61,7 +61,7 @@ private:
// Holds index of output used as in execution domain
// it should be compatible with a schedule's work size
size_t max_rank_out_desc_idx = 0;
std::vector<size_t> exec_domain = {};
/// scheduling info
size_t batchDimIdx = 0;
@@ -74,13 +74,13 @@ private:
std::vector<MKLDNNMemoryPtr> srcMemPtrs = {};
std::vector<MKLDNNMemoryPtr> dstMemPtrs = {};
std::vector<std::vector<int64_t>> dims_in = {};
std::vector<std::vector<int64_t>> offsets_in = {};
std::vector<std::vector<size_t>> dims_in = {};
std::vector<std::vector<size_t>> offsets_in = {};
std::vector<ptrdiff_t> start_offset_in = {};
std::vector<ptrdiff_t> start_offset_out = {};
std::vector<std::vector<int64_t>> dims_out = {};
std::vector<std::vector<int64_t>> offsets_out = {};
std::vector<std::vector<size_t>> dims_out = {};
std::vector<std::vector<size_t>> offsets_out = {};
std::vector<int64_t> sch_dims = {};
std::vector<int64_t> sch_offsets_in = {};