[SnippetS] Perf count nodes and emitters (#19493)

Chenhu Wang 2023-12-05 19:49:27 +08:00 committed by GitHub
parent 791762fb19
commit f80793e420
25 changed files with 781 additions and 113 deletions

View File

@ -29,7 +29,7 @@ class Generator;
class LoweringResult {
friend class Generator;
// Some emitters rely on other precompiled kernels.
// We need to keep the pointers to such emitters alive, so the kernels or nodes would still be accessible at runtime.
std::vector<std::shared_ptr<Emitter>> m_saved_emitters{};
public:

View File

@ -14,6 +14,18 @@ namespace ov {
namespace snippets {
namespace lowered {
// Snippets performance count mode
// Disabled - default, no perf count for snippets
// Chrono - perf count with chrono calls. This is a universal method that also supports the multi-threaded case, outputting perf count data for each thread.
// BackendSpecific - perf count provided by the backend, for device-specific requirements.
// For example, for the sake of lower overhead and more accurate results, an x86 CPU specific mode that reads the RDTSC register is implemented:
// a pair of perf count start and end executions takes ~50ns, while the Chrono mode takes ~260ns on ICX. This mode supports a single thread only.
enum PerfCountMode {
Disabled,
Chrono,
BackendSpecific,
};
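For orientation, a minimal sketch of selecting the Chrono mode on the lowering Config (illustrative only; the commit itself leaves the default at Disabled, and the field is the perf_count_mode member declared below):

ov::snippets::lowered::Config lowering_config;
// default is PerfCountMode::Disabled; Chrono enables the chrono-based counters
lowering_config.perf_count_mode = ov::snippets::lowered::PerfCountMode::Chrono;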
class Config {
public:
// True if the lowered Emitters need to be accessed during runtime. Normally they're destroyed after code emission.
@ -21,6 +33,7 @@ public:
// True if we should check runtime info for nodes to call specific needed transformations
bool m_need_fill_tail_register = false;
size_t m_loop_depth = 1;
PerfCountMode perf_count_mode = PerfCountMode::Disabled;
// Some Subgraphs don't support domain optimization due to operations' semantics
bool m_enable_domain_optimization = false;
// Minimal advised work amount for parallel execution.

View File

@ -0,0 +1,33 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "pass.hpp"
#include "snippets/op/perf_count.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
/**
* @interface InsertPerfCount
 * @brief Insert a PerfCountBegin node after the last parameter and a PerfCountEnd node before the first result.
 * This is an illustrative transformation to enable perf count in snippets.
 * Developers can modify it to insert perf count pairs around any sequence of nodes of interest.
* @ingroup snippets
*/
class InsertPerfCount: public Pass {
public:
OPENVINO_RTTI("InsertPerfCount", "Pass")
InsertPerfCount() = default;
bool run(LinearIR& linear_ir) override;
};
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov

View File

@ -0,0 +1,93 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "openvino/op/op.hpp"
#include "openvino/runtime/threading/thread_local.hpp"
namespace ov {
namespace snippets {
namespace op {
/**
* @interface PerfCountBeginBase
* @brief Base class for PerfCountBegin and PerfCountRdtscBegin(cpu)
* @ingroup snippets
*/
class PerfCountBeginBase : public ov::op::Op {
public:
OPENVINO_OP("PerfCountBeginBase", "SnippetsOpset");
PerfCountBeginBase(const std::vector<Output<Node>>& args);
PerfCountBeginBase() = default;
void validate_and_infer_types() override;
bool visit_attributes(AttributeVisitor& visitor) override;
protected:
void validate_and_infer_types_except_PerfCountEnd();
};
/**
* @interface PerfCountEndBase
* @brief Base class for PerfCountEnd and PerfCountRdtscEnd
* @ingroup snippets
*/
class PerfCountEndBase : public ov::op::Op {
public:
OPENVINO_OP("PerfCountEndBase", "SnippetsOpset");
PerfCountEndBase(const std::vector<Output<Node>>& args);
PerfCountEndBase() = default;
void validate_and_infer_types() override;
bool visit_attributes(AttributeVisitor& visitor) override;
};
/**
* @interface PerfCountBegin
* @brief Performance count start time with chrono call
* @ingroup snippets
*/
class PerfCountBegin : public PerfCountBeginBase {
public:
OPENVINO_OP("PerfCountBegin", "SnippetsOpset", PerfCountBeginBase);
PerfCountBegin();
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
void set_start_time();
std::chrono::high_resolution_clock::time_point& get_start_time();
private:
ov::threading::ThreadLocal<std::chrono::high_resolution_clock::time_point> start_time_stamp;
};
/**
* @interface PerfCountEnd
* @brief Performance count end time and duration with chrono call
* @ingroup snippets
*/
class PerfCountEnd : public PerfCountEndBase {
public:
OPENVINO_OP("PerfCountEnd", "SnippetsOpset", PerfCountEndBase);
PerfCountEnd(const Output<Node>& pc_begin);
PerfCountEnd() = default;
~PerfCountEnd() {
output_perf_count();
}
void output_perf_count();
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
void init_pc_begin();
void set_accumulated_time();
private:
ov::threading::ThreadLocal<uint64_t> accumulation;
ov::threading::ThreadLocal<uint32_t> iteration;
std::shared_ptr<PerfCountBegin> m_pc_begin = nullptr;
};
} // namespace op
} // namespace snippets
} // namespace ov
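For reference, a minimal sketch of what a Begin/End pair accumulates per thread, written as plain C++ with hypothetical local names (the real nodes keep these values in the ThreadLocal members declared above):

// PerfCountBegin::set_start_time()
auto start = std::chrono::high_resolution_clock::now();
// ... measured region (the JIT-ed kernel body) ...
// PerfCountEnd::set_accumulated_time()
auto end = std::chrono::high_resolution_clock::now();
accumulation += std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
iteration++;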

View File

@ -25,6 +25,7 @@
#include "op/brgemm.hpp" #include "op/brgemm.hpp"
#include "op/vector_buffer.hpp" #include "op/vector_buffer.hpp"
#include "op/rank_normalization.hpp" #include "op/rank_normalization.hpp"
#include "op/perf_count.hpp"
namespace ov { namespace ov {
namespace snippets { namespace snippets {

View File

@ -24,6 +24,9 @@ OV_OP(Scalar, ov::snippets::op)
OV_OP(Nop, ov::snippets::op)
OV_OP(RankNormalization, ov::snippets::op)
OV_OP(PerfCountBegin, ov::snippets::op)
OV_OP(PerfCountEnd, ov::snippets::op)
// Layout-oblivious from opset1
// opset completeness

View File

@ -44,7 +44,8 @@ void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, c
}
OV_ITT_TASK_NEXT(GENERATE, "::GetSnippet")
// 1. Some emitters use precompiled kernels. They need to be saved, so the kernels are accessible at runtime.
// 2. Perf count nodes, kept as fields of emitters, should stay alive at runtime.
if (linear_ir.get_config().m_save_expressions) {
for (const auto& expr : linear_ir) {
const auto& emitter = expr->get_emitter();
@ -66,7 +67,9 @@ Generator::opRegType Generator::get_op_reg_type(const std::shared_ptr<Node>& op)
std::dynamic_pointer_cast<op::LoopEnd>(op) ||
std::dynamic_pointer_cast<op::Brgemm>(op) ||
std::dynamic_pointer_cast<op::Buffer>(op) ||
std::dynamic_pointer_cast<op::RankNormalization>(op) ||
std::dynamic_pointer_cast<op::PerfCountBeginBase>(op) ||
std::dynamic_pointer_cast<op::PerfCountEndBase>(op))
return gpr2gpr;
else if (std::dynamic_pointer_cast<snippets::op::Load>(op) ||
std::dynamic_pointer_cast<snippets::op::BroadcastLoad>(op))

View File

@ -0,0 +1,62 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/lowered/pass/insert_perf_count.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/itt.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
bool InsertPerfCount::run(LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertPerfCount")
if (linear_ir.empty())
return false;
auto is_parameter = [](const std::shared_ptr<ov::Node>& node) {
return ov::is_type<ov::op::v0::Parameter>(node);
};
auto is_result = [](const std::shared_ptr<ov::Node>& node) {
return ov::is_type<ov::op::v0::Result>(node);
};
// mark perf_count_begin and perf_count_end position
auto perf_count_begin_pos = linear_ir.cbegin();
auto perf_count_end_pos = perf_count_begin_pos;
bool first_result_marked = false;
for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) {
const auto expr = *expr_it;
const auto& node = expr->get_node();
if (is_parameter(node))
perf_count_begin_pos = expr_it;
if (is_result(node) && !first_result_marked) {
perf_count_end_pos = expr_it;
first_result_marked = true;
}
}
// insert perf_count_begin after the last parameter
// linear_ir.insert inserts before the given iterator, so move to the next position.
perf_count_begin_pos = std::next(perf_count_begin_pos);
const auto& perf_count_begin = std::make_shared<op::PerfCountBegin>();
const auto& perf_count_begin_expr = linear_ir.create_expression(perf_count_begin, std::vector<PortConnectorPtr>{});
linear_ir.insert(perf_count_begin_pos, perf_count_begin_expr);
// insert perf_count_end before first result
const auto& perf_count_end = std::make_shared<op::PerfCountEnd>(perf_count_begin->output(0));
perf_count_end->set_friendly_name("last_parameter_to_first_result");
const auto& perf_count_end_expr = linear_ir.create_expression(perf_count_end, std::vector<PortConnectorPtr>{});
linear_ir.insert(perf_count_end_pos, perf_count_end_expr);
return true;
}
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov

View File

@ -0,0 +1,115 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/op/perf_count.hpp"
namespace ov {
namespace snippets {
namespace op {
/////////////////PerfCountBeginBase/////////////////
PerfCountBeginBase::PerfCountBeginBase(const std::vector<Output<Node>>& args) : Op() {}
void PerfCountBeginBase::validate_and_infer_types() {
validate_and_infer_types_except_PerfCountEnd();
OPENVINO_ASSERT(get_output_size() == 1, "PerfCountBegin must have only one output");
const auto& last_output_inputs = get_output_target_inputs(0);
OPENVINO_ASSERT(last_output_inputs.size() == 1, "PerfCountBegin must have exactly one input attached to the last output");
const auto& pc_end = ov::as_type_ptr<PerfCountEndBase>(last_output_inputs.begin()->get_node()->shared_from_this());
OPENVINO_ASSERT(pc_end != nullptr, "PerfCountBegin must have PerfCountEnd connected to its last output");
}
bool PerfCountBeginBase::visit_attributes(AttributeVisitor &visitor) {
return true;
}
void PerfCountBeginBase::validate_and_infer_types_except_PerfCountEnd() {
NODE_VALIDATION_CHECK(this, get_input_size() == 0, "PerfCountBegin doesn't expect any inputs");
set_output_type(0, element::f32, {});
}
//////////////////PerfCountEndBase/////////////////
PerfCountEndBase::PerfCountEndBase(const std::vector<Output<Node>> &args) : Op(args) {}
void PerfCountEndBase::validate_and_infer_types() {
NODE_VALIDATION_CHECK(this, get_input_size() == 1, "PerfCountEndBase must have one input");
const auto& pc_begin = ov::as_type_ptr<PerfCountBeginBase>(get_input_node_shared_ptr(0));
NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEndBase must have PerfCountBeginBase as the last argument");
set_output_type(0, element::f32, {});
}
bool PerfCountEndBase::visit_attributes(AttributeVisitor &visitor) {
return true;
}
/////////////////PerfCountBegin/////////////////
PerfCountBegin::PerfCountBegin() : PerfCountBeginBase() {
validate_and_infer_types_except_PerfCountEnd();
}
std::shared_ptr<Node> PerfCountBegin::clone_with_new_inputs(const OutputVector& inputs) const {
return std::make_shared<PerfCountBegin>();
}
std::chrono::high_resolution_clock::time_point& PerfCountBegin::get_start_time() {
return start_time_stamp.local();
}
void PerfCountBegin::set_start_time() {
start_time_stamp.local() = std::chrono::high_resolution_clock::now();
}
//////////////////PerfCountEnd///////////////
PerfCountEnd::PerfCountEnd(const Output<Node>& pc_begin) : PerfCountEndBase({pc_begin}), accumulation(0ul), iteration(0u) {
constructor_validate_and_infer_types();
init_pc_begin();
}
std::shared_ptr<Node> PerfCountEnd::clone_with_new_inputs(const OutputVector& inputs) const {
return std::make_shared<PerfCountEnd>(inputs.at(0));
}
void PerfCountEnd::set_accumulated_time() {
auto current_time = std::chrono::high_resolution_clock::now();
auto& start_time = m_pc_begin->get_start_time();
accumulation.local() += std::chrono::duration_cast<std::chrono::nanoseconds>(current_time - start_time).count();
iteration.local()++;
}
void PerfCountEnd::init_pc_begin() {
m_pc_begin = ov::as_type_ptr<PerfCountBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
NODE_VALIDATION_CHECK(this, m_pc_begin != nullptr, "PerfCountEnd last input is not connected to PerfCountBegin");
}
void PerfCountEnd::output_perf_count() {
OPENVINO_ASSERT(accumulation.size() == iteration.size(), "accumulation size should be the same as iteration size in perf_count_end node.");
auto iterator_iter = iteration.begin();
auto iterator_acc = accumulation.begin();
int t_num = 0;
uint64_t avg_max = 0;
std::cout << "Perf count data in perfCountEnd node with name " << get_friendly_name() << " is:"<< std::endl;
for (; iterator_iter != iteration.end(); ++iterator_iter, ++iterator_acc) {
const auto iter = *iterator_iter;
const auto acc = *iterator_acc;
uint64_t avg = iter == 0 ? 0 : acc / iter;
if (avg > avg_max)
avg_max = avg;
std::cout << "accumulated time:" << acc << "ns, iteration:" << iter << " avg time:" << avg << "ns"<< " on thread:" << t_num << std::endl;
t_num++;
}
// max time across all threads: combine with a reduce-max
auto BinaryFunc = [](const uint64_t& a, const uint64_t& b) {
return a >= b ? a : b;
};
// max accumulation
uint64_t acc_max = accumulation.combine(BinaryFunc);
std::cout << "max accumulated time:" << acc_max << "ns" << std::endl;
// max avg
std::cout << "max avg time:" << avg_max << "ns" << std::endl;
}
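For reference, an illustrative single-thread run of output_perf_count() would print lines of the following shape (format taken from the cout statements above; the numbers are invented):

Perf count data in perfCountEnd node with name last_parameter_to_first_result is:
accumulated time:105000ns, iteration:100 avg time:1050ns on thread:0
max accumulated time:105000ns
max avg time:1050ns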
} // namespace op
} // namespace snippets
} // namespace ov

View File

@ -42,6 +42,7 @@
#include "snippets/lowered/pass/validate_loops.hpp" #include "snippets/lowered/pass/validate_loops.hpp"
#include "snippets/lowered/pass/insert_loops.hpp" #include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/optimize_domain.hpp" #include "snippets/lowered/pass/optimize_domain.hpp"
#include "snippets/lowered/pass/insert_perf_count.hpp"
#include "transformations/utils/utils.hpp" #include "transformations/utils/utils.hpp"
@ -349,7 +350,8 @@ VectorDims Subgraph::infer_master_shape() {
std::shared_ptr<lowered::LinearIR>
Subgraph::convert_body_to_linear_ir(const std::shared_ptr<IShapeInferSnippetsFactory>& shape_infer_factory) {
lowered::Config lowering_config;
lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops ||
(lowering_config.perf_count_mode != lowered::PerfCountMode::Disabled);
lowering_config.m_need_fill_tail_register = config.m_has_domain_sensitive_ops;
lowering_config.m_loop_depth = tileRank;
lowering_config.m_enable_domain_optimization = !config.m_has_domain_sensitive_ops;
@ -487,6 +489,10 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const lowered::pass::PassPi
auto linear_ir {*m_linear_ir->clone()};
LoweringResult lowering_result;
control_flow_transformations(linear_ir, lowering_result, backend_passes_pre_common, backend_passes_post_common);
if (linear_ir.get_config().perf_count_mode == lowered::PerfCountMode::Chrono) {
lowered::pass::InsertPerfCount perf_count_pass;
perf_count_pass.run(linear_ir);
}
m_generator->generate(linear_ir, lowering_result, compile_params);
VectorDims parallel_exec_domain = linear_ir.get_master_shape();

View File

@ -55,6 +55,8 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry
SHAPE_INFER_PREDEFINED(op::Scalar, SingleElementShapeInfer),
SHAPE_INFER_PREDEFINED(op::VectorBuffer, SingleElementShapeInfer),
SHAPE_INFER_PREDEFINED(op::LoopEnd, EmptyShapeInfer),
SHAPE_INFER_PREDEFINED(op::PerfCountBegin, EmptyShapeInfer),
SHAPE_INFER_PREDEFINED(op::PerfCountEnd, EmptyShapeInfer),
SHAPE_INFER_PREDEFINED(op::Kernel, EmptyShapeInfer),
SHAPE_INFER_PREDEFINED(op::Nop, EmptyShapeInfer),
SHAPE_INFER_OP_SPECIFIC_EXTERNAL(opset1::Select, SelectShapeInfer),

View File

@ -41,6 +41,8 @@ DummyTargetMachine::DummyTargetMachine(const std::vector<ov::Node::type_info_t>&
jitters[ov::snippets::op::Kernel::get_type_info_static()] = dummy_functor;
jitters[ov::snippets::op::LoopBegin::get_type_info_static()] = dummy_functor;
jitters[ov::snippets::op::LoopEnd::get_type_info_static()] = dummy_functor;
jitters[ov::snippets::op::PerfCountBegin::get_type_info_static()] = dummy_functor;
jitters[ov::snippets::op::PerfCountEnd::get_type_info_static()] = dummy_functor;
jitters[ov::snippets::op::Brgemm::get_type_info_static()] = dummy_functor;
jitters[ov::snippets::op::Buffer::get_type_info_static()] = dummy_functor;
jitters[ov::snippets::op::VectorBuffer::get_type_info_static()] = dummy_functor;

View File

@ -116,6 +116,20 @@ struct ThreadLocal {
auto end() const -> Iterator<decltype(_map.end())> const {
return {_map.end()};
}
// CombineFunc has signature T(T,T) or T(const T&, const T&)
template <typename CombineFunc>
T combine(CombineFunc f_combine) {
if (begin() != end()) {
auto ci = begin();
T my_result = *ci;
while (++ci != end())
my_result = f_combine(my_result, *ci);
return my_result;
} else {
return _create();
}
}
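A usage sketch for combine(), mirroring the reduce-max in PerfCountEnd::output_perf_count() (illustrative; assumes ThreadLocal<uint64_t> is default-constructible, as its use elsewhere in this commit suggests):

ov::threading::ThreadLocal<uint64_t> acc;
// reduce all per-thread values down to the maximum
uint64_t max_acc = acc.combine([](const uint64_t& a, const uint64_t& b) {
    return a >= b ? a : b;
});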
};
#endif

View File

@ -14,12 +14,15 @@
#include "jit_dnnl_emitters.hpp" #include "jit_dnnl_emitters.hpp"
#include "jit_dnnl_ext_emitters.hpp" #include "jit_dnnl_ext_emitters.hpp"
#include "jit_conversion_emitters.hpp" #include "jit_conversion_emitters.hpp"
#include "jit_perf_count_chrono_emitters.hpp"
#include "jit_perf_count_rdtsc_emitters.hpp"
#include "transformations/snippets/x64/op/load_convert.hpp" #include "transformations/snippets/x64/op/load_convert.hpp"
#include "transformations/snippets/x64/op/store_convert.hpp" #include "transformations/snippets/x64/op/store_convert.hpp"
#include "transformations/snippets/x64/op/fused_mul_add.hpp" #include "transformations/snippets/x64/op/fused_mul_add.hpp"
#include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
#include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp"
#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp"
#include "transformations/cpu_opset/common/op/swish_cpu.hpp" #include "transformations/cpu_opset/common/op/swish_cpu.hpp"
#include "transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp" #include "transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp"
@ -157,6 +160,11 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho
jitters[snippets::op::LoopEnd::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(LoopEndEmitter);
jitters[intel_cpu::BrgemmCPU::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(BrgemmEmitter);
jitters[intel_cpu::BrgemmCopyB::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(BrgemmCopyBEmitter);
jitters[snippets::op::PerfCountBegin::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_start_emitter);
jitters[snippets::op::PerfCountEnd::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_end_emitter);
jitters[ov::intel_cpu::PerfCountRdtscBegin::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_start_emitter);
jitters[ov::intel_cpu::PerfCountRdtscEnd::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_end_emitter);
}
size_t intel_cpu::CPUTargetMachine::get_lanes() const {
@ -224,6 +232,10 @@ snippets::Generator::opRegType intel_cpu::CPUGenerator::get_specific_op_reg_type
}
bool intel_cpu::CPUGenerator::uses_precompiled_kernel(const std::shared_ptr<snippets::Emitter>& e) const {
return std::dynamic_pointer_cast<intel_cpu::BrgemmEmitter>(e) ||
std::dynamic_pointer_cast<intel_cpu::BrgemmCopyBEmitter>(e) ||
std::dynamic_pointer_cast<intel_cpu::jit_perf_count_chrono_start_emitter>(e) ||
std::dynamic_pointer_cast<intel_cpu::jit_perf_count_chrono_end_emitter>(e) ||
std::dynamic_pointer_cast<intel_cpu::jit_perf_count_rdtsc_start_emitter>(e) ||
std::dynamic_pointer_cast<intel_cpu::jit_perf_count_rdtsc_end_emitter>(e);
}
} // namespace ov

View File

@ -213,5 +213,73 @@ void jit_emitter::emit_code(const std::vector<size_t> &in_idxs, const std::vecto
emitter_postamble();
}
void jit_emitter::internal_call_preamble() const {
// gprs
Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
h->sub(h->rsp, n_gprs_to_save * gpr_size);
for (size_t i = 0; i < n_gprs_to_save; ++i)
h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);
// mask regs
// need to preserve k-regs based on CPU capability rather than the host ISA,
// in case emitters with different ISAs coexist in one subgraph KernelEmitter for perf reasons in the future.
// e.g. the other emitters' ISA is avx512 while this emitter's ISA is avx2 and an internal call is used; the internal call may use avx512 and spoil the k-regs.
// Platforms with avx512_common but without avx512_core (Knights Landing) are obsolete and not a concern.
if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
h->sub(h->rsp, k_mask_num * k_mask_size);
for (size_t i = 0; i < k_mask_num; ++i) {
h->kmovq(h->ptr[h->rsp + i * k_mask_size], Xbyak::Opmask(static_cast<int>(i)));
}
}
// vector regs
// 1. Caller obligation to save vector registers as callee may use them.
// 2. There is an implicit assumption that the host code uses the same
// `isa` as the injector. Once the assumption is wrong, `vecs_count` and
// `vlen` should be replaced with `host_isa::vlen` and
// `host_isa::vecs_count`.
h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
for (size_t i = 0; i < get_max_vecs_count(); ++i) {
push_vec(h->ptr[h->rsp + i * get_vec_length()], i);
}
}
void jit_emitter::internal_call_postamble() const {
// restore vector registers
for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
pop_vec(static_cast<size_t>(i), h->ptr[h->rsp + i * get_vec_length()]);
}
h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());
// restore k reg
if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
for (int i = k_mask_num - 1; i >= 0; --i) {
h->kmovq(Xbyak::Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
}
h->add(h->rsp, k_mask_num * k_mask_size);
}
// restore gpr registers
Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
for (int i = n_gprs_to_save - 1; i >= 0; --i)
h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
h->add(h->rsp, n_gprs_to_save * gpr_size);
}
void jit_emitter::internal_call_rsp_align() const {
h->mov(h->rbx, h->rsp);
h->and_(h->rbx, 0xf);
h->sub(h->rsp, h->rbx);
}
void jit_emitter::internal_call_rsp_restore() const {
h->add(h->rsp, h->rbx);
}
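A worked example of the align/restore pair above (addresses illustrative):

// entry:               rsp = 0x7ffdab34c8
// h->and_(rbx, 0xf):   rbx = 0xc8 & 0xf = 8
// h->sub(rsp, rbx):    rsp = 0x7ffdab34c0 -> 16-byte aligned for the call
// restore:             add rsp, rbx undoes the shift; rbx is callee-saved per the ABI,
//                      so a conforming callee cannot clobber it between the two helpers.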
} // namespace intel_cpu
} // namespace ov

View File

@ -106,6 +106,8 @@ protected:
mutable std::vector<size_t> aux_gpr_idxs;
static constexpr int k_mask_size = 8;
static constexpr int k_mask_num = 8;
static constexpr int gpr_size = 8;
Xbyak::Address table_val(std::string key, size_t key_off_val_shift = 0) const {
auto off = table_off(key, key_off_val_shift);
@ -130,6 +132,13 @@ protected:
}
}
void internal_call_preamble() const;
void internal_call_postamble() const;
// align the stack on a 16-byte boundary as the ABI requires
// rbx must not be changed by the callee (it is callee-saved), since it holds the alignment offset until internal_call_rsp_restore() runs.
void internal_call_rsp_align() const;
void internal_call_rsp_restore() const;
private:
mutable std::vector<size_t> preserved_vec_idxs;
mutable std::vector<size_t> preserved_gpr_idxs;

View File

@ -0,0 +1,73 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "jit_emitter.hpp"
#include "jit_perf_count_chrono_emitters.hpp"
#include <cpu/x64/jit_generator.hpp>
using namespace dnnl::impl;
using namespace dnnl::impl::utils;
using namespace dnnl::impl::cpu;
using namespace dnnl::impl::cpu::x64;
using namespace Xbyak;
using namespace Xbyak::util;
namespace ov {
namespace intel_cpu {
jit_perf_count_chrono_start_emitter::jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
m_start_node = ov::as_type_ptr<snippets::op::PerfCountBegin>(n);
}
size_t jit_perf_count_chrono_start_emitter::get_inputs_num() const {
return 0;
}
void jit_perf_count_chrono_start_emitter::set_start_time(snippets::op::PerfCountBegin* start_node) {
start_node->set_start_time();
}
void jit_perf_count_chrono_start_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
internal_call_preamble();
const auto &set_start_time_overload = static_cast<void (*)(snippets::op::PerfCountBegin*)>(set_start_time);
h->mov(h->rax, reinterpret_cast<size_t>(set_start_time_overload));
h->mov(abi_param1, reinterpret_cast<size_t>(m_start_node.get()));
internal_call_rsp_align();
h->call(h->rax);
internal_call_rsp_restore();
internal_call_postamble();
}
///////////////////jit_perf_count_chrono_end_emitter////////////////////////////////////
jit_perf_count_chrono_end_emitter::jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
m_end_node = ov::as_type_ptr<snippets::op::PerfCountEnd>(n);
}
size_t jit_perf_count_chrono_end_emitter::get_inputs_num() const {
return 0;
}
void jit_perf_count_chrono_end_emitter::set_accumulated_time(snippets::op::PerfCountEnd* end_node) {
end_node->set_accumulated_time();
}
void jit_perf_count_chrono_end_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
internal_call_preamble();
const auto &set_accumulated_time_overload = static_cast<void (*)(snippets::op::PerfCountEnd*)>(set_accumulated_time);
h->mov(h->rax, reinterpret_cast<size_t>(set_accumulated_time_overload));
h->mov(abi_param1, reinterpret_cast<size_t>(m_end_node.get()));
internal_call_rsp_align();
h->call(h->rax);
internal_call_rsp_restore();
internal_call_postamble();
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,40 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "jit_emitter.hpp"
#include <cpu/x64/jit_generator.hpp>
#include "snippets/op/perf_count.hpp"
namespace ov {
namespace intel_cpu {
class jit_perf_count_chrono_start_emitter : public jit_emitter {
public:
jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override;
private:
void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
static void set_start_time(snippets::op::PerfCountBegin* start_node);
std::shared_ptr<snippets::op::PerfCountBegin> m_start_node = nullptr;
};
class jit_perf_count_chrono_end_emitter : public jit_emitter {
public:
jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override;
private:
void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
static void set_accumulated_time(snippets::op::PerfCountEnd* end_node);
std::shared_ptr<snippets::op::PerfCountEnd> m_end_node = nullptr;
};
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,86 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "jit_emitter.hpp"
#include "jit_perf_count_rdtsc_emitters.hpp"
#include <cpu/x64/jit_generator.hpp>
using namespace dnnl::impl;
using namespace dnnl::impl::utils;
using namespace dnnl::impl::cpu;
using namespace dnnl::impl::cpu::x64;
using namespace Xbyak;
using namespace Xbyak::util;
namespace ov {
namespace intel_cpu {
jit_perf_count_rdtsc_start_emitter::jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
m_start_node = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscBegin>(n);
}
size_t jit_perf_count_rdtsc_start_emitter::get_inputs_num() const {
return 0;
}
void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
h->push(h->rax);
h->push(h->rdx);
// The EDX register is loaded with the high-order 32 bits of the MSR and the EAX register is loaded with the low-order 32 bits.
h->lfence();
h->rdtsc();
h->lfence();
h->shl(h->rdx, 0x20); // shift to higher half of rdx 0x20(32)
h->or_(h->rdx, h->rax); // rdx has current tsc
h->mov(h->rax, reinterpret_cast<size_t>(&m_start_node->start_count));
h->mov(qword[h->rax], h->rdx);
h->pop(h->rdx);
h->pop(h->rax);
}
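The emitted sequence above corresponds roughly to the following host-side C++ using compiler intrinsics (illustrative; header and intrinsic names assume GCC/Clang, and the helper name is hypothetical):

#include <x86intrin.h>   // __rdtsc, _mm_lfence (GCC/Clang)
static inline uint64_t read_tsc_serialized() {
    _mm_lfence();                  // keep earlier instructions from drifting past the read
    uint64_t tsc = __rdtsc();      // combines EDX:EAX into one 64-bit value
    _mm_lfence();                  // keep later instructions from moving before the read
    return tsc;                    // what the JIT code stores into start_count
}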
///////////////////jit_perf_count_rdtsc_end_emitter////////////////////////////////////
jit_perf_count_rdtsc_end_emitter::jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
m_end_node = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscEnd>(n);
}
size_t jit_perf_count_rdtsc_end_emitter::get_inputs_num() const {
return 0;
}
void jit_perf_count_rdtsc_end_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
h->push(h->rax);
h->push(h->rdx);
h->lfence();
h->rdtsc();
h->lfence();
h->shl(h->rdx, 0x20);
h->or_(h->rdx, h->rax); // rdx has current tsc
// tsc duration
h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->get_pc_begin()->start_count));
h->sub(h->rdx, qword[h->rax]); // rdx has tsc duration
// accumulation = accumulation + tsc duration
h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->accumulation));
h->add(h->rdx, qword[h->rax]);
h->mov(qword[h->rax], h->rdx);
// iteration++
h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->iteration));
h->mov(h->rdx, qword[h->rax]);
h->add(h->rdx, 0x01);
h->mov(qword[h->rax], h->rdx);
h->pop(h->rdx);
h->pop(h->rax);
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,37 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "jit_emitter.hpp"
#include <cpu/x64/jit_generator.hpp>
#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp"
namespace ov {
namespace intel_cpu {
class jit_perf_count_rdtsc_start_emitter : public jit_emitter {
public:
jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override;
private:
void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
std::shared_ptr<ov::intel_cpu::PerfCountRdtscBegin> m_start_node = nullptr;
};
class jit_perf_count_rdtsc_end_emitter : public jit_emitter {
public:
jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override;
private:
void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
std::shared_ptr<ov::intel_cpu::PerfCountRdtscEnd> m_end_node = nullptr;
};
} // namespace intel_cpu
} // namespace ov

View File

@ -26,10 +26,6 @@ using jit_generator = dnnl::impl::cpu::x64::jit_generator;
using cpu_isa_t = dnnl::impl::cpu::x64::cpu_isa_t;
using ExpressionPtr = ov::snippets::lowered::ExpressionPtr;
inline static void transform_idxs_to_regs(const std::vector<size_t>& idxs, std::vector<Reg64>& regs) {
regs.resize(idxs.size());
std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx){return Reg64(static_cast<int>(idx));});
@ -1114,32 +1110,7 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, c
h->add(h->rsp, n_gprs_to_save * gpr_size);
}
internal_call_preamble();
// save function address in gpr to pass in call instruction
const auto& brgemm_kernel_overload = static_cast<void (*)(const brgemm_kernel_t*,
@ -1193,38 +1164,15 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, c
h->mov(abi_param6, static_cast<int>(m_with_comp));
#endif
internal_call_rsp_align();
h->call(h->rbp);
internal_call_rsp_restore();
#ifdef _WIN32
h->add(h->rsp, num_args_passed_on_stack * gpr_size);
#endif
internal_call_postamble();
}
void BrgemmEmitter::kernel_execute(const brgemm_kernel_t *brg_kernel, void BrgemmEmitter::kernel_execute(const brgemm_kernel_t *brg_kernel,
@ -1358,32 +1306,7 @@ void BrgemmCopyBEmitter::emit_impl(const std::vector<size_t>& in,
void BrgemmCopyBEmitter::emit_kernel_call(const matmul::jit_brgemm_matmul_copy_b_t* kernel, Reg64 src, Reg64 dst, Reg64 comp,
size_t N, size_t K, size_t offset_in, size_t offset_out, size_t offset_comp) const {
internal_call_preamble();
const auto data_ptr = [&](Xmm xmm, Xbyak::Reg64 reg, size_t bytes_offset) {
h->uni_vmovq(reg, xmm);
@ -1437,38 +1360,16 @@ void BrgemmCopyBEmitter::emit_kernel_call(const matmul::jit_brgemm_matmul_copy_b
h->mov(abi_param5, N);
h->mov(abi_param6, K);
#endif
internal_call_rsp_align();
h->call(h->rbp);
internal_call_rsp_restore();
#ifdef _WIN32
h->add(h->rsp, gpr_size * num_args_passed_on_stack);
#endif
internal_call_postamble();
}
void BrgemmCopyBEmitter::execute(matmul::jit_brgemm_matmul_copy_b_t *kernel, const void *src,

View File

@ -15,6 +15,7 @@
#include "transformations/snippets/x64/op/store_convert.hpp" #include "transformations/snippets/x64/op/store_convert.hpp"
#include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp"
#include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp"
#include <ov_ops/augru_cell.hpp>
#include <ov_ops/augru_sequence.hpp>
@ -159,12 +160,16 @@ std::map<std::string, ngraph::OpSet> Extension::getOpSets() {
NGRAPH_OP(Subgraph, ov::snippets::op)
NGRAPH_OP(VectorBuffer, ov::snippets::op)
NGRAPH_OP(RankNormalization, ov::snippets::op)
NGRAPH_OP(PerfCountBegin, ov::snippets::op)
NGRAPH_OP(PerfCountEnd, ov::snippets::op)
NGRAPH_OP_X64(LoadConvertSaturation, ov::intel_cpu)
NGRAPH_OP_X64(LoadConvertTruncation, ov::intel_cpu)
NGRAPH_OP_X64(StoreConvertSaturation, ov::intel_cpu)
NGRAPH_OP_X64(StoreConvertTruncation, ov::intel_cpu)
NGRAPH_OP_X64(BrgemmCPU, ov::intel_cpu)
NGRAPH_OP_X64(BrgemmCopyB, ov::intel_cpu)
NGRAPH_OP_X64(PerfCountRdtscBegin, ov::intel_cpu)
NGRAPH_OP_X64(PerfCountRdtscEnd, ov::intel_cpu)
#undef NGRAPH_OP
return opset;

View File

@ -0,0 +1,32 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "perf_count_rdtsc.hpp"
using namespace ov;
using namespace ov::intel_cpu;
/////////////////////////PerfCountRdtscBegin//////////////////////
PerfCountRdtscBegin::PerfCountRdtscBegin() : PerfCountBeginBase() {
validate_and_infer_types_except_PerfCountEnd();
}
std::shared_ptr<Node> PerfCountRdtscBegin::clone_with_new_inputs(const OutputVector& inputs) const {
return std::make_shared<PerfCountRdtscBegin>();
}
/////////////////////////PerfCountRdtscEnd//////////////////////
PerfCountRdtscEnd::PerfCountRdtscEnd(const Output<Node>& pc_begin) : ov::snippets::op::PerfCountEndBase({pc_begin}), accumulation(0ul), iteration(0u) {
constructor_validate_and_infer_types();
}
std::shared_ptr<Node> PerfCountRdtscEnd::clone_with_new_inputs(const OutputVector& inputs) const {
return std::make_shared<PerfCountRdtscEnd>(inputs.at(0));
}
std::shared_ptr<PerfCountRdtscBegin> PerfCountRdtscEnd::get_pc_begin() {
const auto& pc_begin = ov::as_type_ptr<PerfCountRdtscBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
OPENVINO_ASSERT(pc_begin != nullptr, "PerfCountRdtscEnd last input is not connected to PerfCountRdtscBegin");
return pc_begin;
}

View File

@ -0,0 +1,55 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "openvino/op/op.hpp"
#include "snippets/op/perf_count.hpp"
using namespace ov::snippets::op;
namespace ov {
namespace intel_cpu {
/**
* @interface PerfCountRdtscBegin
* @brief Performance count start time via read rdtsc register
* @ingroup snippets
*/
class PerfCountRdtscBegin : public PerfCountBeginBase {
public:
OPENVINO_OP("PerfCountRdtscBegin", "SnippetsOpset", PerfCountBeginBase);
PerfCountRdtscBegin();
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
uint64_t start_count = 0ul;
};
/**
* @interface PerfCountRdtscEnd
* @brief Performance count end time and duration
* @ingroup snippets
*/
class PerfCountRdtscEnd : public PerfCountEndBase {
public:
OPENVINO_OP("PerfCountRdtscEnd", "SnippetsOpset", PerfCountEndBase);
PerfCountRdtscEnd(const Output<Node>& pc_begin);
PerfCountRdtscEnd() = default;
~PerfCountRdtscEnd() {
uint64_t avg = iteration == 0 ? 0 : accumulation / iteration;
std::cout << "accumulation:" << accumulation << " iteration:" << iteration << " avg:" << avg << std::endl;
}
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
std::shared_ptr<PerfCountRdtscBegin> get_pc_begin();
// On each call, PerfCountRdtscBegin stores start_count.
// On each call, PerfCountRdtscEnd reads the current count, then accumulation += current_count - start_count, and iteration++.
// The destructor of PerfCountRdtscEnd outputs the perf info.
// accumulation is a cycle count.
uint64_t accumulation = 0ul;
uint32_t iteration = 0u;
};
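Note that accumulation here is a raw TSC cycle count, not nanoseconds as in the chrono variant; converting to time requires the invariant TSC frequency. An illustrative conversion, assuming a hypothetical 2.0 GHz TSC:

uint64_t avg_cycles = iteration == 0 ? 0 : accumulation / iteration;
double avg_ns = avg_cycles / 2.0;   // 2.0 cycles per nanosecond at the assumed 2 GHz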
} // namespace intel_cpu
} // namespace ov

View File

@ -9,6 +9,7 @@
#include "op/fused_mul_add.hpp" #include "op/fused_mul_add.hpp"
#include "op/load_convert.hpp" #include "op/load_convert.hpp"
#include "op/store_convert.hpp" #include "op/store_convert.hpp"
#include "op/perf_count_rdtsc.hpp"
#include "transformations/cpu_opset/common/op/swish_cpu.hpp" #include "transformations/cpu_opset/common/op/swish_cpu.hpp"
namespace ov { namespace ov {
@ -38,6 +39,8 @@ const CPUShapeInferSnippetsFactory::TRegistry CPUShapeInferSnippetsFactory::spec
SHAPE_INFER_PREDEFINED(ov::intel_cpu::LoadConvertTruncation, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertSaturation, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertTruncation, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscBegin, EmptyShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscEnd, EmptyShapeInfer),
SHAPE_INFER_OP_SPECIFIC_EXTERNAL(ov::intel_cpu::BrgemmCPU, BrgemmShapeInfer),
//
SHAPE_INFER_OP_SPECIFIC(ov::intel_cpu::BrgemmCopyB),