[CPU] Snippets throughput mode fixes (#9488)

This commit is contained in:
Ivan Novoselov
2022-02-08 17:58:42 +03:00
committed by GitHub
parent dfc738b493
commit b47b8ad4bf
11 changed files with 330 additions and 266 deletions

View File

@@ -16,29 +16,24 @@ namespace op {
* @brief Generated by Canonicalization for a scalar constant Shape() == {1}
* @ingroup snippets
*/
class Scalar : public ngraph::op::Constant {
class Scalar : public ov::op::v0::Constant {
public:
OPENVINO_OP("Scalar", "SnippetsOpset", ngraph::op::Constant);
OPENVINO_OP("Scalar", "SnippetsOpset", ov::op::v0::Constant);
Scalar() = default;
Scalar(const std::shared_ptr<runtime::Tensor>& tensor) : Constant(tensor) {}
template <typename T>
Scalar(const element::Type& type, Shape shape, const std::vector<T>& values) : Constant(type, shape, values) {}
Scalar(const element::Type& type, const Shape& shape) : Constant(type, shape) {}
template <class T, class = typename std::enable_if<std::is_fundamental<T>::value>::type>
Scalar(const element::Type& type, Shape shape, T value) : Constant(type, shape, value) {}
Scalar(const element::Type& type, Shape shape, const std::vector<std::string>& values) : Constant(type, shape, values) {}
Scalar(const element::Type& type, const Shape& shape, const void* data) : Constant(type, shape, data) {}
Scalar(const Constant& other) : Constant(other) {}
Scalar(const Scalar& other) : Constant(other) {}
Scalar& operator=(const Scalar&) = delete;
~Scalar() override {}
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override {
check_new_args_count(this, new_args);
return std::make_shared<Scalar>(*this);
Scalar(const element::Type& type, Shape shape, T value) : Constant(type, shape, value) {
constructor_validate_and_infer_types();
}
explicit Scalar(const Constant& other) : Constant(other) {
constructor_validate_and_infer_types();
}
Scalar(const Scalar& other) : Constant(other) {
constructor_validate_and_infer_types();
}
Scalar& operator=(const Scalar&) = delete;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
void validate_and_infer_types() override;
};
} // namespace op

View File

@@ -88,14 +88,14 @@ public:
return m_generator;
}
std::shared_ptr<Subgraph> make_canonical_from_this();
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes,
ngraph::pass::Manager opt = ngraph::pass::Manager(), const void* compile_params = nullptr);
ngraph::pass::Manager& opt, const void* compile_params = nullptr);
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes,
const void* compile_params = nullptr);
/// Set a new body for the op; body needs to satisfy requirements on inputs/outputs
void set_body(std::shared_ptr<ov::Model> body);
snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr);
snippets::Schedule generate(const void* compile_params = nullptr);
Shape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes);
// plugin sets generator for a snippet to some specific generator.
// it's going to be replaced with Jitters table later
@@ -109,9 +109,8 @@ public:
static auto wrap_node_as_subgraph(const std::shared_ptr<ngraph::Node>& node) -> std::shared_ptr<Subgraph>;
private:
void canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes);
void convert_to_snippet_dialect();
Shape exec_domain;
std::shared_ptr<ov::Model> m_body;
std::shared_ptr<ngraph::snippets::Generator> m_generator;
};

View File

@@ -0,0 +1,27 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>
namespace ngraph {
namespace snippets {
namespace pass {
/**
 * @interface ConvertConstantsToScalars
 * @brief Replace constants that should be represented as scalars during code generation.
 * Only single-value (0D) constants are currently supported.
 * @ingroup snippets
 */
class ConvertConstantsToScalars: public ngraph::pass::MatcherPass {
public:
ConvertConstantsToScalars();
};
} // namespace pass
} // namespace snippets
} // namespace ngraph

View File

@@ -0,0 +1,26 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>
namespace ngraph {
namespace snippets {
namespace pass {
/**
 * @interface ConvertPowerToPowerStatic
 * @brief Replace Power with a scalar input with snippets::op::PowerStatic for generation of a more optimal code.
 * @ingroup snippets
 */
class ConvertPowerToPowerStatic: public ngraph::pass::MatcherPass {
public:
ConvertPowerToPowerStatic();
};
} // namespace pass
} // namespace snippets
} // namespace ngraph

View File

@@ -0,0 +1,22 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/op/scalar.hpp"
using namespace ngraph;
// Scalar has no inputs, so cloning is a plain copy; check_new_args_count enforces that new_args is empty.
std::shared_ptr<Node> snippets::op::Scalar::clone_with_new_inputs(const OutputVector& new_args) const {
check_new_args_count(this, new_args);
return std::make_shared<Scalar>(*this);
}
// Scalar currently supports only one-element constants, this could be changed in the future
void snippets::op::Scalar::validate_and_infer_types() {
// Let the base Constant infer the output type/shape first, then constrain it.
Constant::validate_and_infer_types();
auto out_pshape = get_output_partial_shape(0);
NODE_VALIDATION_CHECK(this, out_pshape.is_static(), "Scalar supports only static input shapes");
// Accept either a true 0D scalar (empty shape) or any shape with exactly one element, e.g. {1} or {1,1}.
NODE_VALIDATION_CHECK(this, out_pshape.get_shape().empty() || ov::shape_size(out_pshape.get_shape()) == 1,
"Scalar supports only one-element constants, got ", out_pshape.get_shape(),
" shape");
}

View File

@@ -10,6 +10,8 @@
#include "snippets/pass/insert_movebroadcast.hpp"
#include "snippets/pass/load_movebroadcast_to_broadcastload.hpp"
#include "snippets/pass/assign_registers.hpp"
#include "snippets/pass/convert_constants_to_scalars.hpp"
#include "snippets/pass/convert_power_to_powerstatic.hpp"
#include <ngraph/pass/manager.hpp>
#include <openvino/pass/serialize.hpp>
@@ -115,79 +117,102 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
return subgraph;
}
std::shared_ptr<snippets::op::Subgraph> snippets::op::Subgraph::make_canonical_from_this() {
INTERNAL_OP_SCOPE(Subgraph);
ngraph::OutputVector subgraph_node_inputs;
for (auto input : this->input_values()) {
subgraph_node_inputs.push_back(input);
}
auto new_body = ov::clone_model(*this->get_body().get());
auto snippet = std::make_shared<op::Subgraph>(subgraph_node_inputs, new_body);
ngraph::copy_runtime_info(this->shared_from_this(), snippet);
snippet->set_friendly_name(this->get_friendly_name());
snippet->set_generator(this->m_generator);
return snippet;
}
// We also can think of canonization as of pass to copy original subgraph and transforming it to canonical form suitable for code generation
// pass actual parameters and results shapes to generate for as well as channel mapping,
// Todo: we need to distinguish between 5d tensors that represents <N, C, H, W, c> and <N, C, D, H, W> somehow like locked dimensions
// ngraph::AxisVector to code
void snippets::op::Subgraph::canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes) {
///
/// \brief Canonization transforms original subgraph and to canonical form suitable for code generation. In particular,
/// it handles supported layout conversions, broadcasts inputs and outputs to a single rank and layout. Canonicalization
/// returns master-shape (max rank + max dimensions over all outputs) that can be used for scheduling.
/// Canonicalization currently supports only the following layout conversions:
/// * None: all inputs have the same layout
/// * Planar + blocked: some inputs have blocked, and some have planar layouts, e.g. <N, C, H, W, c> + <N, C, H, W>
Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) {
INTERNAL_OP_SCOPE(Subgraph);
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::canonicalize")
NODE_VALIDATION_CHECK(this, input_shapes.size() == m_body->get_parameters().size(),
"Number of parameters for snippet doesn't match passed to generate method: ", input_shapes.size(), " vs ", m_body->get_parameters().size(), ".");
NODE_VALIDATION_CHECK(this, inputShapes.size() == m_body->get_parameters().size(),
"Number of parameters for snippet doesn't match passed to generate method: ", inputShapes.size(), " vs ", m_body->get_parameters().size(), ".");
NODE_VALIDATION_CHECK(this, output_shapes.size() == m_body->get_results().size(),
"number of results for snippet doesn't match passed to generate method: ", output_shapes.size(), " vs ", m_body->get_results().size(), ".");
// replace only constants which are actually should be represented as scalars during code generation and probably move this step a bit later
for (auto op : m_body->get_ordered_ops()) {
if (auto constant = ngraph::as_type_ptr<opset1::Constant>(op)) {
auto scalar = std::make_shared<snippets::op::Scalar>(*constant);
scalar->set_friendly_name(constant->get_friendly_name());
ngraph::copy_runtime_info(constant, scalar);
ngraph::replace_node(constant, scalar);
}
NODE_VALIDATION_CHECK(this, outputShapes.size() == m_body->get_results().size(),
"number of results for snippet doesn't match passed to generate method: ", outputShapes.size(), " vs ", m_body->get_results().size(), ".");
// todo: is it allowed to have outputs with different layouts? I assume not; remove this check if that assumption is invalid
const AxisVector outOrder = get<1>(outputShapes[0]);
for (size_t i = 1; i < outputShapes.size(); i++) {
const AxisVector order_i = get<1>(outputShapes[i]);
NODE_VALIDATION_CHECK(this, outOrder.size() == order_i.size() && equal(outOrder.begin(), outOrder.end(), order_i.begin()),
"Snippets output shapes must have the same layout");
}
// it should be in subgraph node to be aligned with internal and external parameter list, but adding this for testing
// TODO: store blocking into to Parameter's rt_info for future propagation
for (size_t i = 0; i < m_body->get_parameters().size(); i++) {
auto param = m_body->get_parameters()[i];
if (param->get_shape().size() < 4) {
std::vector<size_t> shape(4, 1);
std::copy(param->get_shape().begin(), param->get_shape().end(), &shape.at(4 - (param->get_shape().size() == 0 ? 1 : param->get_shape().size())) );
m_body->replace_parameter(i, std::make_shared<opset1::Parameter>(param->get_element_type(), ngraph::Shape(shape)));
} else if (param->get_shape().size() >= 4) {
if (param->get_element_type() != std::get<2>(input_shapes[i])) {
throw ngraph::ngraph_error("changes in presision. Is it legal??");
auto getMaxRankBlockedShape = [](const BlockedShapeVector& blockedShapes) -> const BlockedShape& {
return *std::max_element(blockedShapes.begin(), blockedShapes.end(),
[&](const BlockedShape& lhs, const BlockedShape& rhs) {
return std::get<0>(lhs).size() < std::get<0>(rhs).size();
});
};
Shape baseShape;
AxisVector baseOrder;
std::tie(baseShape, baseOrder, std::ignore) = getMaxRankBlockedShape(inputShapes);
const auto baseRank = baseShape.size();
const bool baseIsBlocked = baseOrder.size() != std::set<size_t>(baseOrder.begin(), baseOrder.end()).size();
for (size_t i = 0; i < inputShapes.size(); i++) {
const auto &blockedShape = inputShapes[i];
Shape inShape;
AxisVector inOrder;
element::Type inType;
std::tie(inShape, inOrder, inType) = blockedShape;
const auto inRank = inShape.size();
NODE_VALIDATION_CHECK(this, inRank <= baseRank, "Input rank can't be larger than output rank in snippets.");
if (inRank < baseRank) {
Shape newShape(baseRank, 1);
// todo: more complicated logics is needed if we want to merge smth else than blocked and planar
// could be done by PartialShape::broadcast_merge_into, but this way is faster
size_t startOffset = baseRank - inRank;
if (baseIsBlocked) {
const bool inIsNotBlocked = inOrder.size() == std::set<size_t>(inOrder.begin(), inOrder.end()).size();
NODE_VALIDATION_CHECK(this, inIsNotBlocked, "Snippets don't support conversion between blocked layouts of different ranks");
startOffset--;
}
m_body->replace_parameter(i, std::make_shared<opset1::Parameter>(std::get<2>(input_shapes[i]), std::get<0>(input_shapes[i])));
std::copy(inShape.begin(), inShape.end(), &newShape[startOffset]);
inShape = move(newShape);
} else {
// todo: 4d blocked + 5d planar layouts are not supported: <N, C, H, W, c> + <N, C, D, H, W>
NODE_VALIDATION_CHECK(this,
equal(baseOrder.begin(), baseOrder.end(), inOrder.begin()),
"Snippets canonicalization got input shapes of equal ranks but different layouts, which is not supported");
}
ov::PartialShape tmpPShape(baseShape);
NODE_VALIDATION_CHECK(this,
PartialShape::broadcast_merge_into(tmpPShape, inShape, ::ngraph::op::AutoBroadcastType::NUMPY),
"Failed to create broadcastable shapes in snippets canonicalization");
const auto paramShape = m_body->get_parameters()[i]->get_shape();
if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin()))
m_body->replace_parameter(i, std::make_shared<opset1::Parameter>(inType, inShape));
}
m_body->validate_nodes_and_infer_types();
for (size_t i = 0; i < m_body->get_results().size(); i++) {
auto result = m_body->get_results()[i];
PartialShape partial(result->get_shape());
bool isCompatible = ngraph::PartialShape::broadcast_merge_into(partial, std::get<0>(output_shapes[i]), ::ngraph::op::AutoBroadcastType::NUMPY);
// equality check won't pass since we reshape without changes on external snippet edges
NODE_VALIDATION_CHECK(this, isCompatible, "Inferend and passed results shapes are difference for snippet : ",
result->get_shape(), " vs ", std::get<0>(output_shapes[i]), ".");
// Check that output shapes are broadcastable => can be scheduled
const auto& body_results = m_body->get_results();
PartialShape outPShape = body_results[0]->get_shape();
for (size_t i = 0; i < body_results.size(); i++) {
auto shape_i = body_results[i]->get_shape();
PartialShape pShape_i(shape_i);
// Check that the produced output shape corresponds to the passed shape
bool compatibleWithPassedShape = PartialShape::broadcast_merge_into(pShape_i, std::get<0>(outputShapes[i]),
::ngraph::op::AutoBroadcastType::NUMPY);
NODE_VALIDATION_CHECK(this, compatibleWithPassedShape, "Inferred and passed results shapes are difference for snippet : ",
shape_i, " vs ", std::get<0>(outputShapes[i]), ".");
// Check that output shapes are broadcastable to each other => can be scheduled
bool compatibleWithOtherOutputs = PartialShape::broadcast_merge_into(outPShape, shape_i,
::ngraph::op::AutoBroadcastType::NUMPY);
NODE_VALIDATION_CHECK(this, compatibleWithOtherOutputs, "Snippets output shapes must be numpy broadcastable");
}
exec_domain = outPShape.get_shape();
return exec_domain;
}
void snippets::op::Subgraph::convert_to_snippet_dialect() {
INTERNAL_OP_SCOPE(Subgraph);
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::convert_to_snippet_dialect")
ngraph::pass::Manager manager;
manager.register_pass<snippets::pass::ConvertConstantsToScalars>();
manager.register_pass<snippets::pass::ConvertPowerToPowerStatic>();
manager.register_pass<snippets::pass::InsertLoad>();
manager.register_pass<snippets::pass::InsertStore>();
manager.register_pass<snippets::pass::InsertMoveBroadcast>();
@@ -198,39 +223,27 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() {
snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& output_shapes,
const BlockedShapeVector& input_shapes,
const void* compile_params) {
return generate(output_shapes, input_shapes, ngraph::pass::Manager(), compile_params);
canonicalize(output_shapes, input_shapes);
return generate(compile_params);
}
snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& output_shapes,
const BlockedShapeVector& input_shapes,
ngraph::pass::Manager opt,
ngraph::pass::Manager& opt,
const void* compile_params) {
canonicalize(output_shapes, input_shapes);
return generate(opt, compile_params);
}
snippets::Schedule snippets::op::Subgraph::generate(const void* compile_params) {
auto mngr = ngraph::pass::Manager();
return generate(mngr, compile_params);
}
snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, const void* compile_params) {
INTERNAL_OP_SCOPE(Subgraph);
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::generate")
NGRAPH_CHECK(m_generator != nullptr, "generate is called while generator is not set");
canonicalize(output_shapes, input_shapes);
// Todo: ngraph::pass::Manager introduces appreciable overheads, especially while used on small graphs.
// So don't wrap this transformation as a MatcherPass, but rewrite convert_to_snippet_dialect() as a
// for loop to improve first-inference time.
// replace power with power static
for (auto op : m_body->get_ordered_ops()) {
if (ov::is_type<opset1::Power>(op) &&
ov::is_type<snippets::op::Scalar>(op->get_input_node_shared_ptr(1)) &&
ov::shape_size(op->get_input_shape(1)) == 1) {
auto power = ov::as_type_ptr<opset1::Power>(op);
auto scalar = ov::as_type_ptr<snippets::op::Scalar>(op->get_input_node_shared_ptr(1));
auto value = scalar->cast_vector<float>()[0];;
auto power_static = std::make_shared<snippets::op::PowerStatic>(power->input(0).get_source_output(), value);
power_static->set_friendly_name(power->get_friendly_name());
ngraph::copy_runtime_info(power, power_static);
ngraph::replace_node(power, power_static);
}
}
convert_to_snippet_dialect();
opt.run_passes(m_body);
@@ -253,27 +266,7 @@ snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& ou
}
NGRAPH_CHECK(!constants.size(), "External constants detected. Snippet is illigal for scheduling");
// check resulting shapes are broadcastable to each other so can be scheduled
Shape work_size = m_body->output(0).get_shape();
for (size_t k = 0; k < m_body->get_output_size(); k++) {
auto shape = m_body->output(k).get_shape();
if (work_size.size() != shape.size()) {
throw ngraph_error("rank for all outputs of a snippet should match");
}
for (size_t i = 0; i < work_size.size(); i++) {
if (work_size[i] != shape[i]) {
if (work_size[i] == 1 || shape[i] == 1) {
work_size[i] = max(work_size[i], shape[i]);
} else {
throw ngraph_error("incompatible shapes for output graphs");
}
}
}
}
return {work_size, false /*canBeLinearized*/, ptr};
return {exec_domain, false /*canBeLinearized*/, ptr};
}
void snippets::op::Subgraph::print() const {

View File

@@ -0,0 +1,28 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <snippets/itt.hpp>
#include "snippets/snippets_isa.hpp"
#include "snippets/pass/convert_constants_to_scalars.hpp"
#include <ngraph/rt_info.hpp>
// Replaces one-element Constants with snippets::op::Scalar so they can be emitted as scalars.
ngraph::snippets::pass::ConvertConstantsToScalars::ConvertConstantsToScalars() {
    MATCHER_SCOPE(ConvertConstantsToScalars);
    // Match only one-element Constants: snippets::op::Scalar::validate_and_infer_types()
    // rejects multi-element constants, so converting such a Constant would throw inside
    // the callback instead of simply leaving the node untouched.
    auto constants = std::make_shared<pattern::op::Label>(pattern::any_input(),
                                                          [](std::shared_ptr<Node> n) {
                                                              return ngraph::is_type<ov::op::v0::Constant>(n) &&
                                                                     ov::shape_size(n->get_output_shape(0)) == 1;
                                                          });
    ngraph::graph_rewrite_callback callback = [this](ngraph::pattern::Matcher &m) {
        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ConvertConstantsToScalars")
        auto constant = as_type_ptr<ov::op::v0::Constant>(m.get_match_root());
        // Build an equivalent Scalar and transfer the friendly name and rt_info before swapping it in.
        auto scalar = std::make_shared<snippets::op::Scalar>(*constant);
        scalar->set_friendly_name(constant->get_friendly_name());
        ngraph::copy_runtime_info(constant, scalar);
        ngraph::replace_node(constant, scalar);
        return true;
    };
    register_matcher(std::make_shared<ov::pass::pattern::Matcher>(constants), callback);
}

View File

@@ -0,0 +1,31 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <snippets/itt.hpp>
#include "snippets/snippets_isa.hpp"
#include "snippets/pass/convert_power_to_powerstatic.hpp"
#include <ngraph/rt_info.hpp>
// Replaces Power(x, Scalar) with snippets::op::PowerStatic(x) carrying the exponent as an attribute,
// which allows the code generator to emit a more optimal sequence.
ngraph::snippets::pass::ConvertPowerToPowerStatic::ConvertPowerToPowerStatic() {
    MATCHER_SCOPE(ConvertPowerToPowerStatic);
    // Match Power whose exponent input is already a snippets::op::Scalar
    // (Scalar is guaranteed to hold exactly one element by its validate_and_infer_types()).
    auto scalarPower = std::make_shared<pattern::op::Label>(pattern::any_input(),
                                                            [](std::shared_ptr<Node> n) {
                                                                return is_type<ov::op::v1::Power>(n) &&
                                                                       is_type<snippets::op::Scalar>(n->get_input_node_shared_ptr(1));
                                                            });
    ngraph::graph_rewrite_callback callback = [this](ngraph::pattern::Matcher &m) {
        // Note: the task name previously said "ConvertConstantsToScalars" (copy-paste);
        // it must reference this pass so ITT profiles attribute time correctly.
        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ConvertPowerToPowerStatic")
        auto power = ov::as_type_ptr<ov::op::v1::Power>(m.get_match_root());
        auto scalar = ov::as_type_ptr<snippets::op::Scalar>(power->get_input_node_shared_ptr(1));
        // Safe: Scalar holds exactly one element, so index 0 always exists.
        auto value = scalar->cast_vector<float>()[0];
        auto power_static = std::make_shared<snippets::op::PowerStatic>(power->input(0).get_source_output(), value);
        power_static->set_friendly_name(power->get_friendly_name());
        ngraph::copy_runtime_info(power, power_static);
        ngraph::replace_node(power, power_static);
        return true;
    };
    register_matcher(std::make_shared<ov::pass::pattern::Matcher>(scalarPower), callback);
}

View File

@@ -24,7 +24,7 @@ struct jit_snippets_compile_args {
int64_t scheduler_dims[SNIPPETS_MAX_TILE_RANK] = {};
int64_t scheduler_offsets[SNIPPETS_MAX_SNIPPETS_DIMS] = {};
int64_t data_offsets[SNIPPETS_MAX_SNIPPETS_DIMS * SNIPPETS_MAX_HARNESS_DIMS] = {};
std::vector<int64_t> output_dims = {};
std::vector<size_t> output_dims = {};
};
///
/// \brief Kernel is the only entry point to Codogen Jit compilation. Kernel calculates appropriate data offsets,
@@ -361,13 +361,7 @@ class ScalarEmitter : public jit_emitter {
public:
ScalarEmitter(mkldnn::impl::cpu::x64::jit_generator* h, mkldnn::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr<ov::Node>& n)
: jit_emitter(h, isa, n) {
auto out_pshape = n->output(0).get_tensor().get_partial_shape();
if (out_pshape.is_dynamic())
IE_THROW() << "ScalarEmitter supports only static input shapes";
if ( out_pshape.get_shape() != ov::Shape() && ov::shape_size(out_pshape.get_shape()) != 1)
IE_THROW() << "ScalarEmitter got invalid shape";
value = mkldnn::impl::cpu::x64::float2int(ov::as_type_ptr<ngraph::snippets::op::Scalar>(n)->cast_vector<float>()[0]);
push_arg_entry_of("scalar", value, true);
prepare_table();
}

View File

@@ -58,25 +58,6 @@ void MKLDNNSnippetNode::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;
auto hasBroadcastByC = [this]() -> bool {
for (auto op : ngraph::as_type_ptr<ngraph::snippets::op::Subgraph>(snippet)->get_body()->get_ops()) {
if (ngraph::op::supports_auto_broadcast(op)) {
auto shape = op->get_input_shape(0);
// Filter out scalar empty shape Shape{}
if (ngraph::shape_size(shape) != 1) {
for (const auto& input : op->inputs()) {
if (input.get_shape().size() > 1 && shape[1] != input.get_shape()[1] && ngraph::shape_size(input.get_shape()) != 1) {
return true;
}
}
} else {
return false;
}
}
}
return false;
};
const Precision supportedPrecision = Precision::FP32;
bool dimRanksAreEqual = true;
@@ -90,9 +71,9 @@ void MKLDNNSnippetNode::initSupportedPrimitiveDescriptors() {
const size_t ndims = outputShapes[0].getRank();
const bool isChannelsFirstApplicable = dnnl::impl::utils::one_of(ndims, 1, 2, 4, 5) && dimRanksAreEqual;
// Todo: Snippets currently don't support per-channel broadcasting of Blocked descriptors because
// canonicalization can't distinguish between <N, C, H, W, c> and <N, C, D, H, W> cases. So we need to pass an
// additional parameter to canonicalization, see snippets::op::Subgraph::canonicalize for details.
const bool isBlockedApplicable = dnnl::impl::utils::one_of(ndims, 4, 5) && dimRanksAreEqual && !hasBroadcastByC();
// canonicalization can't distinguish between <N, C, H, W, c> and <N, C, D, H, W> cases.
// See snippets::op::Subgraph::canonicalize for details.
const bool isBlockedApplicable = dnnl::impl::utils::one_of(ndims, 4, 5) && dimRanksAreEqual;
enum LayoutType {
Planar,
ChannelsFirst,
@@ -255,15 +236,17 @@ static size_t argmax_rank(const std::vector<MKLDNNEdgeWeakPtr> &childEdges) {
return max_rank_idx;
}
static auto offset_calculation(std::vector<int64_t>& offset, const std::vector<int64_t>& dims_in, const std::vector<int64_t>& dims_out) -> void {
int k = 1;
static void offset_calculation(std::vector<size_t>& offset, const std::vector<size_t>& dims_in, const std::vector<size_t>& dims_out) {
size_t k = 1;
for (int i = offset.size() - 1; i >= 0; i--) {
offset[i] = (dims_in[i] == dims_out[i]) ? k : 0;
k *= dims_in[i];
}
}
static auto collapseLastDims(std::vector<int64_t>& dims, int dimsToCollapse) -> void {
static auto collapseLastDims(std::vector<size_t>& dims, size_t dimsToCollapse) -> void {
if (dimsToCollapse >= dims.size() - 1)
IE_THROW() << "Got invalid number of dims to collapse. Expected < " << dims.size() - 1 << " got " << dimsToCollapse;
for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) {
dims[dims.size() - 1] *= dims[i];
}
@@ -278,63 +261,51 @@ static auto collapseLastDims(std::vector<int64_t>& dims, int dimsToCollapse) ->
}
void MKLDNNSnippetNode::define_schedule() {
auto edgeToBlockedShape = [](const MKLDNNEdgePtr& edge) {
const auto blockedDesc = edge->getMemory().GetDescWithType<BlockedMemoryDesc>();
ngraph::Shape shape(blockedDesc->getBlockDims());
ngraph::AxisVector blocking(blockedDesc->getOrder());
ngraph::element::Type precision = InferenceEngine::details::convertPrecision(blockedDesc->getPrecision());
return ngraph::snippets::op::Subgraph::BlockedShape{shape, blocking, precision};
};
auto prependWithOnes = [this](const std::vector<size_t>& dims) {
if (tensorRank <= dims.size())
return dims;
VectorDims result(tensorRank, 1);
std::copy(dims.begin(), dims.end(), &result[tensorRank - dims.size()]);
return result;
};
ngraph::snippets::op::Subgraph::BlockedShapeVector input_blocked_shapes;
for (size_t i = 0; i < inputShapes.size(); i++)
input_blocked_shapes.push_back(edgeToBlockedShape(getParentEdgesAtPort(i)[0]));
ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes;
for (size_t i = 0; i < outputShapes.size(); i++)
output_blocked_shapes.push_back(edgeToBlockedShape(getChildEdgesAtPort(i)[0]));
exec_domain = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes);
// initialize by maximum output dimension. Dimensions of outputs should be broadcastable
tensorRank = std::max(static_cast<size_t>(rank6D), exec_domain.size());
// Canonicalization broadcasts inputs and outputs to max input rank, which can be smaller than tensorRank
// prepend to enable 6D scheduler
exec_domain = prependWithOnes(exec_domain);
const auto &body = snippet->get_body();
for (const auto& p : body->get_parameters()) {
dims_in.emplace_back(prependWithOnes(p->get_shape()));
}
for (size_t i = 0; i < body->get_output_size(); i++) {
dims_out.push_back(prependWithOnes(body->get_output_shape(i)));
}
const auto config = getSelectedPrimitiveDescriptor()->getConfig();
const auto dataSize = config.inConfs[0].getMemDesc()->getPrecision().size();
// store to use as an execution domain
max_rank_out_desc_idx = argmax_rank(getChildEdges());
const auto outBlockingDesc_maxRank = getChildEdgeAt(max_rank_out_desc_idx)->getMemory().GetDescWithType<BlockedMemoryDesc>();
// initialize by maximum output dimension. Dimensions of outputs should be broadcastable
tensorRank = std::max(static_cast<size_t>(rank6D), outBlockingDesc_maxRank->getBlockDims().size());
auto initDims = [this, config, &outBlockingDesc_maxRank](size_t tensorRank) {
// assume all input sizes are even
const size_t inputNum = getParentEdges().size();
dims_in.resize(inputNum);
for (size_t i = 0; i < inputNum; i++) {
dims_in[i].resize(tensorRank, 1);
}
const auto outOrder = outBlockingDesc_maxRank->getOrder();
for (size_t i = 0; i < inputNum; i++) {
auto inBlockingDesc = getParentEdgeAt(i)->getMemory().GetDescWithType<BlockedMemoryDesc>();
size_t rank = inBlockingDesc->getBlockDims().size();
// WA to normalize blocked and planar layouts
// not actual thought, since [§] doesn't support mixed layouts yet
auto inOrder = inBlockingDesc->getOrder();
size_t startOff = outOrder.size() != outBlockingDesc_maxRank->getShape().getRank() &&
outOrder.back() != inOrder.back() ? 1 : 0;
for (size_t j = 0; j < rank; j++) {
dims_in[i][dims_in[i].size() - 1 - j - startOff] = inBlockingDesc->getBlockDims()[rank - 1 - j];
}
}
// assume all output sizes are even
const size_t outputNum = config.outConfs.size();
dims_out.resize(outputNum);
for (size_t i = 0; i < outputNum; i++) {
dims_out[i].resize(tensorRank, 1);
}
for (size_t i = 0; i < outputNum; i++) {
auto outBlockingDesc = getChildEdgeAt(i)->getMemory().GetDescWithType<BlockedMemoryDesc>();
size_t rank = outBlockingDesc->getBlockDims().size();
for (size_t j = 0; j < rank; j++) {
dims_out[i][dims_out[i].size() - 1 - j] = outBlockingDesc->getBlockDims()[rank - 1 - j];
}
}
};
auto initOffsets = [this, config, dataSize](size_t tensorRank) {
auto initOffsets = [this, config, dataSize]() {
// find max rank input among all outputs
const size_t inputNum = getParentEdges().size();
offsets_in.resize(inputNum);
for (size_t i = 0; i < inputNum; i++) {
offsets_in[i].resize(tensorRank, 1);
offset_calculation(offsets_in[i], dims_in[i], dims_out[max_rank_out_desc_idx]);
offset_calculation(offsets_in[i], dims_in[i], exec_domain);
for (size_t j = 0; j < tensorRank; j++) {
offsets_in[i][j] *= dataSize;
}
@@ -352,7 +323,7 @@ void MKLDNNSnippetNode::define_schedule() {
offsets_out.resize(outputNum);
for (size_t i = 0; i < outputNum; i++) {
offsets_out[i].resize(tensorRank, 1);
offset_calculation(offsets_out[i], dims_out[i], dims_out[max_rank_out_desc_idx]);
offset_calculation(offsets_out[i], dims_out[i], exec_domain);
for (size_t j = 0; j < tensorRank; j++) {
offsets_out[i][j] *= dataSize;
}
@@ -367,13 +338,13 @@ void MKLDNNSnippetNode::define_schedule() {
}
};
auto find_dims_to_collapse = [this, config, &outBlockingDesc_maxRank]() -> int {
auto find_dims_to_collapse = [this, config]() -> int {
int collapsedDims = 0;
size_t minimalConcurrency = parallel_get_max_threads();
size_t minimalJitWorkAmount = 256;
size_t currentJitWorkAmount = dims_out[max_rank_out_desc_idx].back();
size_t currentJitWorkAmount = exec_domain.back();
while (currentJitWorkAmount < minimalJitWorkAmount && currentJitWorkAmount < fullWorkAmount) {
if (static_cast<int>(dims_out[max_rank_out_desc_idx].size()) - collapsedDims - 2 < 0)
if (static_cast<int>(exec_domain.size()) - collapsedDims - 2 < 0)
break;
bool canCollapse = true;
@@ -385,7 +356,7 @@ void MKLDNNSnippetNode::define_schedule() {
}
}
size_t nextJitWorkAmount = currentJitWorkAmount * dims_out[max_rank_out_desc_idx][dims_out[max_rank_out_desc_idx].size() - 2];
size_t nextJitWorkAmount = currentJitWorkAmount * exec_domain[exec_domain.size() - 2];
if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) {
currentJitWorkAmount = nextJitWorkAmount;
// if we cannot use dim collapsing we should use tile2D
@@ -399,13 +370,13 @@ void MKLDNNSnippetNode::define_schedule() {
}
collapsedDims++;
for (size_t i = 0; i < dims_in.size(); i++) {
collapseLastDims(dims_in[i], 1);
}
for (auto &d : dims_in)
collapseLastDims(d, 1);
for (size_t i = 0; i < dims_out.size(); i++) {
collapseLastDims(dims_out[i], 1);
}
for (auto &d : dims_out)
collapseLastDims(d, 1);
collapseLastDims(exec_domain, 1);
} else {
break;
}
@@ -413,23 +384,23 @@ void MKLDNNSnippetNode::define_schedule() {
return collapsedDims;
};
auto initSchedulingInfo = [this, dataSize](const size_t tensorRank) -> void {
auto initSchedulingInfo = [this, dataSize]() -> void {
// initialize scheduling information
sch_offsets_in.resize(offsets_in.size(), 0);
sch_offsets_out.resize(offsets_out.size(), 0);
sch_dims.resize(maxTileRank, 1);
sch_dims[maxTileRank-1] = dims_out[max_rank_out_desc_idx].back();
schedulerWorkAmount = fullWorkAmount / dims_out[max_rank_out_desc_idx].back();
sch_dims[maxTileRank-1] = exec_domain.back();
schedulerWorkAmount = fullWorkAmount / exec_domain.back();
if (tileRank > 1) {
sch_dims[maxTileRank - tileRank] = dims_out[max_rank_out_desc_idx][tensorRank - 2];
schedulerWorkAmount /= dims_out[max_rank_out_desc_idx][tensorRank - 2];
dims_out[max_rank_out_desc_idx][tensorRank - 2] = 1;
sch_dims[maxTileRank - tileRank] = exec_domain[tensorRank - 2];
schedulerWorkAmount /= exec_domain[tensorRank - 2];
exec_domain[tensorRank - 2] = 1;
// update offsets for tile 2D because loaders have ptr shifts in some cases and stores have always ptrs shifts
for (size_t i = 0; i < offsets_in.size(); i++) {
int64_t offset = offsets_in[i][tensorRank - 2];
if ((offset > dataSize) || (offset == 0 && dims_in[i].back() != 1)) {
sch_offsets_in[i] = offset - dims_out[max_rank_out_desc_idx].back() * dataSize;
sch_offsets_in[i] = offset - exec_domain.back() * dataSize;
} else if (offset == dataSize) {
sch_offsets_in[i] = offset;
}
@@ -437,49 +408,27 @@ void MKLDNNSnippetNode::define_schedule() {
for (size_t i = 0; i < offsets_out.size(); i++) {
int64_t offset = offsets_out[i][tensorRank - 2];
sch_offsets_out[i] = offset - dims_out[max_rank_out_desc_idx].back() * dataSize;
sch_offsets_out[i] = offset - exec_domain.back() * dataSize;
}
}
};
initDims(tensorRank);
fullWorkAmount = 1;
for (size_t i = 0; i < dims_out[max_rank_out_desc_idx].size(); i++) {
fullWorkAmount *= dims_out[max_rank_out_desc_idx][i];
for (const auto &d : exec_domain) {
fullWorkAmount *= d;
}
const int collapsedDims = find_dims_to_collapse();
batchDimIdx = tensorRank - outBlockingDesc_maxRank->getBlockDims().size() + collapsedDims;
batchDimIdx = tensorRank - exec_domain.size();
// Note that exec_domain can be modified inside find_dims_to_collapse() and/or initSchedulingInfo()
find_dims_to_collapse();
initOffsets(tensorRank);
initSchedulingInfo(tensorRank);
initOffsets();
initSchedulingInfo();
}
void MKLDNNSnippetNode::generate() {
std::vector<MKLDNNEdgePtr> input_first_row;
for (size_t i = 0; i < inputShapes.size(); i++)
input_first_row.push_back(getParentEdgesAtPort(i)[0]);
auto edgeToBlockedShape = [](const MKLDNNEdgePtr& edge) -> ngraph::snippets::op::Subgraph::BlockedShape {
const auto blockedDesc = edge->getMemory().GetDescWithType<BlockedMemoryDesc>();
ngraph::Shape shape(blockedDesc->getBlockDims());
ngraph::AxisVector blocking(blockedDesc->getOrder());
ngraph::element::Type precision = InferenceEngine::details::convertPrecision(blockedDesc->getPrecision());
return std::make_tuple(shape, blocking, precision);
};
ngraph::snippets::op::Subgraph::BlockedShapeVector input_blocked_shapes;
std::transform(input_first_row.begin(), input_first_row.end(), std::back_inserter(input_blocked_shapes), edgeToBlockedShape);
std::vector<MKLDNNEdgePtr> output_first_row;
for (size_t i = 0; i < outputShapes.size(); i++)
// Can it go with difference shape or precision to different edges? I assume no.
output_first_row.push_back(getChildEdgesAtPort(i)[0]);
ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes;
std::transform(output_first_row.begin(), output_first_row.end(), std::back_inserter(output_blocked_shapes), edgeToBlockedShape);
jit_snippets_compile_args jcp;
jcp.output_dims = dims_out[max_rank_out_desc_idx];
jcp.output_dims = exec_domain;
std::copy(sch_dims.begin(), sch_dims.end(), jcp.scheduler_dims);
std::copy(sch_offsets_in.begin(), sch_offsets_in.end(), jcp.scheduler_offsets);
std::copy(sch_offsets_out.begin(), sch_offsets_out.end(), &jcp.scheduler_offsets[sch_offsets_in.size()]);
@@ -496,11 +445,11 @@ void MKLDNNSnippetNode::generate() {
auto b = offsets_out[i].begin();
std::copy(b, b + harness_num_dims, &jcp.data_offsets[(inputShapes.size() + i) * harness_num_dims]);
}
schedule = snippet->generate(output_blocked_shapes, input_blocked_shapes, reinterpret_cast<void*>(&jcp));
schedule = snippet->generate(reinterpret_cast<void*>(&jcp));
}
void MKLDNNSnippetNode::schedule_6d(const jit_snippets_call_args& call_args) const {
const auto& dom = dims_out[max_rank_out_desc_idx];
const auto& dom = exec_domain;
// < N, C, H, W > < 1, 1, N, C*H*W>
parallel_for5d(dom[0], dom[1], dom[2], dom[3], dom[4],
[&](int64_t d0, int64_t d1, int64_t d2, int64_t d3, int64_t d4) {
@@ -510,7 +459,7 @@ void MKLDNNSnippetNode::schedule_6d(const jit_snippets_call_args& call_args) con
}
void MKLDNNSnippetNode::schedule_nt(const jit_snippets_call_args& call_args) const {
const auto& work_size = dims_out[max_rank_out_desc_idx];
const auto& work_size = exec_domain;
parallel_nt(0, [&](const int ithr, const int nthr) {
size_t start = 0, end = 0;
splitter(schedulerWorkAmount, nthr, ithr, start, end);

View File

@@ -61,7 +61,7 @@ private:
// Holds index of output used as in execution domain
// it should be compatible with a schedule's work size
size_t max_rank_out_desc_idx = 0;
std::vector<size_t> exec_domain = {};
/// scheduling info
size_t batchDimIdx = 0;
@@ -74,13 +74,13 @@ private:
std::vector<MKLDNNMemoryPtr> srcMemPtrs = {};
std::vector<MKLDNNMemoryPtr> dstMemPtrs = {};
std::vector<std::vector<int64_t>> dims_in = {};
std::vector<std::vector<int64_t>> offsets_in = {};
std::vector<std::vector<size_t>> dims_in = {};
std::vector<std::vector<size_t>> offsets_in = {};
std::vector<ptrdiff_t> start_offset_in = {};
std::vector<ptrdiff_t> start_offset_out = {};
std::vector<std::vector<int64_t>> dims_out = {};
std::vector<std::vector<int64_t>> offsets_out = {};
std::vector<std::vector<size_t>> dims_out = {};
std::vector<std::vector<size_t>> offsets_out = {};
std::vector<int64_t> sch_dims = {};
std::vector<int64_t> sch_offsets_in = {};