[SnippetS] Perf count nodes and emitters (#19493)
Parent: 791762fb19 · Commit: f80793e420
@ -29,7 +29,7 @@ class Generator;
 class LoweringResult {
     friend class Generator;
     // Some emitters rely on other precompiled kernels.
-    // We need to keep the pointers to such emitters alive, so the kernels would still be accessible at runtime.
+    // We need to keep the pointers to such emitters alive, so the kernels or nodes would still be accessible at runtime.
     std::vector<std::shared_ptr<Emitter>> m_saved_emitters{};

 public:
@ -14,6 +14,18 @@ namespace ov {
 namespace snippets {
 namespace lowered {

+// Snippets performance count mode:
+// Disabled - default, no perf count for snippets.
+// Chrono - perf count via std::chrono calls. A universal method that supports the multi-threaded case and outputs perf count data for each thread.
+// BackendSpecific - perf count provided by the backend, for device-specific requirements.
+// For example, for the sake of lower overhead and more accurate results, an x86-specific mode that reads the RDTSC register is implemented;
+// a pair of perf count start/end executions takes ~50ns with RDTSC versus ~260ns with Chrono, measured on ICX. This mode supports a single thread only.
+enum PerfCountMode {
+    Disabled,
+    Chrono,
+    BackendSpecific,
+};

 class Config {
 public:
     // True if the lowered Emitters need to be accessed during runtime. Normally they're destroyed after code emission.
@ -21,6 +33,7 @@ public:
     // True if we should check runtime info for nodes to call specific needed transformations
     bool m_need_fill_tail_register = false;
     size_t m_loop_depth = 1;
+    PerfCountMode perf_count_mode = PerfCountMode::Disabled;
     // Some Subgraphs don't support domain optimization due to operations' semantics
     bool m_enable_domain_optimization = false;
     // Minimal advised work amount for parallel execution.
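As a sketch of how a developer would opt into the new mode (field names taken from the Config above; the wiring in subgraph.cpp further down keys off the same flag):

    // Usage sketch: enable Chrono perf counting in the lowered Config.
    ov::snippets::lowered::Config cfg;
    cfg.perf_count_mode = ov::snippets::lowered::PerfCountMode::Chrono;
    // Expressions must be saved so the perf count nodes stay alive at runtime
    // (subgraph.cpp below derives m_save_expressions from perf_count_mode).
    cfg.m_save_expressions = true;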
@ -0,0 +1,33 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "pass.hpp"

#include "snippets/op/perf_count.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
 * @interface InsertPerfCount
 * @brief Insert a PerfCountBegin node after the last parameter and a PerfCountEnd node before the first result.
 * This is an illustrative transformation to enable perf count in snippets.
 * Developers could modify it to insert perf count pairs around any sequence of nodes of interest.
 * @ingroup snippets
 */
class InsertPerfCount: public Pass {
public:
    OPENVINO_RTTI("InsertPerfCount", "Pass")
    InsertPerfCount() = default;
    bool run(LinearIR& linear_ir) override;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
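A minimal usage sketch of the pass (this mirrors the invocation added to subgraph.cpp later in this diff):

    // Run the illustrative pass on an existing LinearIR.
    ov::snippets::lowered::pass::InsertPerfCount perf_count_pass;
    perf_count_pass.run(linear_ir);  // returns false if linear_ir is empty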
src/common/snippets/include/snippets/op/perf_count.hpp (new file, 93 lines)
@ -0,0 +1,93 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"
#include "openvino/runtime/threading/thread_local.hpp"

namespace ov {
namespace snippets {
namespace op {

/**
 * @interface PerfCountBeginBase
 * @brief Base class for PerfCountBegin and PerfCountRdtscBegin (CPU)
 * @ingroup snippets
 */
class PerfCountBeginBase : public ov::op::Op {
public:
    OPENVINO_OP("PerfCountBeginBase", "SnippetsOpset");
    PerfCountBeginBase(const std::vector<Output<Node>>& args);
    PerfCountBeginBase() = default;

    void validate_and_infer_types() override;
    bool visit_attributes(AttributeVisitor& visitor) override;

protected:
    void validate_and_infer_types_except_PerfCountEnd();
};

/**
 * @interface PerfCountEndBase
 * @brief Base class for PerfCountEnd and PerfCountRdtscEnd
 * @ingroup snippets
 */
class PerfCountEndBase : public ov::op::Op {
public:
    OPENVINO_OP("PerfCountEndBase", "SnippetsOpset");
    PerfCountEndBase(const std::vector<Output<Node>>& args);
    PerfCountEndBase() = default;

    void validate_and_infer_types() override;
    bool visit_attributes(AttributeVisitor& visitor) override;
};

/**
 * @interface PerfCountBegin
 * @brief Performance count start time with chrono call
 * @ingroup snippets
 */
class PerfCountBegin : public PerfCountBeginBase {
public:
    OPENVINO_OP("PerfCountBegin", "SnippetsOpset", PerfCountBeginBase);
    PerfCountBegin();

    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;

    void set_start_time();
    std::chrono::high_resolution_clock::time_point& get_start_time();

private:
    ov::threading::ThreadLocal<std::chrono::high_resolution_clock::time_point> start_time_stamp;
};

/**
 * @interface PerfCountEnd
 * @brief Performance count end time and duration with chrono call
 * @ingroup snippets
 */
class PerfCountEnd : public PerfCountEndBase {
public:
    OPENVINO_OP("PerfCountEnd", "SnippetsOpset", PerfCountEndBase);
    PerfCountEnd(const Output<Node>& pc_begin);
    PerfCountEnd() = default;
    ~PerfCountEnd() {
        output_perf_count();
    }
    void output_perf_count();
    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;

    void init_pc_begin();
    void set_accumulated_time();

private:
    ov::threading::ThreadLocal<uint64_t> accumulation;
    ov::threading::ThreadLocal<uint32_t> iteration;
    std::shared_ptr<PerfCountBegin> m_pc_begin = nullptr;
};

} // namespace op
} // namespace snippets
} // namespace ov
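Conceptually, each PerfCountBegin/PerfCountEnd pair accumulates per-thread durations like the following self-contained std::chrono sketch (the ThreadLocal indirection is elided; names here are illustrative only):

    #include <chrono>
    #include <cstdint>

    // Sketch of what one begin/end pair accumulates on a single thread.
    struct PerfPair {
        std::chrono::high_resolution_clock::time_point start;
        uint64_t accumulation = 0;  // total nanoseconds over all iterations
        uint32_t iteration = 0;

        void begin() { start = std::chrono::high_resolution_clock::now(); }
        void end() {
            const auto now = std::chrono::high_resolution_clock::now();
            accumulation += std::chrono::duration_cast<std::chrono::nanoseconds>(now - start).count();
            ++iteration;
        }
    };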
@ -25,6 +25,7 @@
 #include "op/brgemm.hpp"
 #include "op/vector_buffer.hpp"
 #include "op/rank_normalization.hpp"
+#include "op/perf_count.hpp"

 namespace ov {
 namespace snippets {
@ -24,6 +24,9 @@ OV_OP(Scalar, ov::snippets::op)
 OV_OP(Nop, ov::snippets::op)
 OV_OP(RankNormalization, ov::snippets::op)

+OV_OP(PerfCountBegin, ov::snippets::op)
+OV_OP(PerfCountEnd, ov::snippets::op)

 // Layout-oblivious from opset1

 // opset completeness
@ -44,7 +44,8 @@ void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, c
 }
 OV_ITT_TASK_NEXT(GENERATE, "::GetSnippet")

-// Note: some emitters use precompiled kernels. They need to be saved, so the kernels are accessible at runtime.
+// 1. Some emitters use precompiled kernels. They need to be saved, so the kernels are accessible at runtime.
+// 2. A perf count node held as a field of an emitter should stay alive at runtime.
 if (linear_ir.get_config().m_save_expressions) {
     for (const auto& expr : linear_ir) {
         const auto& emitter = expr->get_emitter();
@ -66,7 +67,9 @@ Generator::opRegType Generator::get_op_reg_type(const std::shared_ptr<Node>& op)
     std::dynamic_pointer_cast<op::LoopEnd>(op) ||
     std::dynamic_pointer_cast<op::Brgemm>(op) ||
     std::dynamic_pointer_cast<op::Buffer>(op) ||
-    std::dynamic_pointer_cast<op::RankNormalization>(op))
+    std::dynamic_pointer_cast<op::RankNormalization>(op) ||
+    std::dynamic_pointer_cast<op::PerfCountBeginBase>(op) ||
+    std::dynamic_pointer_cast<op::PerfCountEndBase>(op))
     return gpr2gpr;
 else if (std::dynamic_pointer_cast<snippets::op::Load>(op) ||
          std::dynamic_pointer_cast<snippets::op::BroadcastLoad>(op))
src/common/snippets/src/lowered/pass/insert_perf_count.cpp (new file, 62 lines)
@ -0,0 +1,62 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/lowered/pass/insert_perf_count.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/itt.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

bool InsertPerfCount::run(LinearIR& linear_ir) {
    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertPerfCount")
    if (linear_ir.empty())
        return false;

    auto is_parameter = [](const std::shared_ptr<ov::Node>& node) {
        return ov::is_type<ov::op::v0::Parameter>(node);
    };
    auto is_result = [](const std::shared_ptr<ov::Node>& node) {
        return ov::is_type<ov::op::v0::Result>(node);
    };

    // mark the perf_count_begin and perf_count_end positions
    auto perf_count_begin_pos = linear_ir.cbegin();
    auto perf_count_end_pos = perf_count_begin_pos;
    bool first_result_marked = false;
    for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) {
        const auto expr = *expr_it;
        const auto& node = expr->get_node();
        if (is_parameter(node))
            perf_count_begin_pos = expr_it;

        if (is_result(node) && !first_result_marked) {
            perf_count_end_pos = expr_it;
            first_result_marked = true;
        }
    }

    // insert perf_count_begin after the last parameter
    // linear_ir.insert inserts before the iterator, so move to the next position first
    perf_count_begin_pos = std::next(perf_count_begin_pos);
    const auto& perf_count_begin = std::make_shared<op::PerfCountBegin>();
    const auto& perf_count_begin_expr = linear_ir.create_expression(perf_count_begin, std::vector<PortConnectorPtr>{});
    linear_ir.insert(perf_count_begin_pos, perf_count_begin_expr);

    // insert perf_count_end before the first result
    const auto& perf_count_end = std::make_shared<op::PerfCountEnd>(perf_count_begin->output(0));
    perf_count_end->set_friendly_name("last_parameter_to_first_result");
    const auto& perf_count_end_expr = linear_ir.create_expression(perf_count_end, std::vector<PortConnectorPtr>{});
    linear_ir.insert(perf_count_end_pos, perf_count_end_expr);

    return true;
}

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
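For intuition, the pass changes the expression order roughly as follows (schematic only, not actual LinearIR dump output):

    // before:  Parameter ... Parameter | <body ops> | Result ... Result
    // after:   Parameter ... Parameter | PerfCountBegin | <body ops> | PerfCountEnd | Result ... Result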
src/common/snippets/src/op/perf_count.cpp (new file, 115 lines)
@ -0,0 +1,115 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/op/perf_count.hpp"

namespace ov {
namespace snippets {
namespace op {

/////////////////PerfCountBeginBase/////////////////
PerfCountBeginBase::PerfCountBeginBase(const std::vector<Output<Node>>& args) : Op() {}

void PerfCountBeginBase::validate_and_infer_types() {
    validate_and_infer_types_except_PerfCountEnd();
    OPENVINO_ASSERT(get_output_size() == 1, "PerfCountBegin must have only one output");
    const auto& last_output_inputs = get_output_target_inputs(0);
    OPENVINO_ASSERT(last_output_inputs.size() == 1, "PerfCountBegin must have exactly one consumer attached to its last output");
    const auto& pc_end = ov::as_type_ptr<PerfCountEndBase>(last_output_inputs.begin()->get_node()->shared_from_this());
    OPENVINO_ASSERT(pc_end != nullptr, "PerfCountBegin must have PerfCountEnd connected to its last output");
}

bool PerfCountBeginBase::visit_attributes(AttributeVisitor &visitor) {
    return true;
}

void PerfCountBeginBase::validate_and_infer_types_except_PerfCountEnd() {
    NODE_VALIDATION_CHECK(this, get_input_size() == 0, "PerfCountBegin doesn't expect any inputs");
    set_output_type(0, element::f32, {});
}

//////////////////PerfCountEndBase/////////////////
PerfCountEndBase::PerfCountEndBase(const std::vector<Output<Node>> &args) : Op(args) {}

void PerfCountEndBase::validate_and_infer_types() {
    NODE_VALIDATION_CHECK(this, get_input_size() == 1, "PerfCountEndBase must have one input");
    const auto& pc_begin = ov::as_type_ptr<PerfCountBeginBase>(get_input_node_shared_ptr(0));
    NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEndBase must have PerfCountBeginBase as the last argument");
    set_output_type(0, element::f32, {});
}

bool PerfCountEndBase::visit_attributes(AttributeVisitor &visitor) {
    return true;
}

/////////////////PerfCountBegin/////////////////
PerfCountBegin::PerfCountBegin() : PerfCountBeginBase() {
    validate_and_infer_types_except_PerfCountEnd();
}

std::shared_ptr<Node> PerfCountBegin::clone_with_new_inputs(const OutputVector& inputs) const {
    return std::make_shared<PerfCountBegin>();
}

std::chrono::high_resolution_clock::time_point& PerfCountBegin::get_start_time() {
    return start_time_stamp.local();
}

void PerfCountBegin::set_start_time() {
    start_time_stamp.local() = std::chrono::high_resolution_clock::now();
}

//////////////////PerfCountEnd///////////////
PerfCountEnd::PerfCountEnd(const Output<Node>& pc_begin) : PerfCountEndBase({pc_begin}), accumulation(0ul), iteration(0u) {
    constructor_validate_and_infer_types();
    init_pc_begin();
}

std::shared_ptr<Node> PerfCountEnd::clone_with_new_inputs(const OutputVector& inputs) const {
    return std::make_shared<PerfCountEnd>(inputs.at(0));
}

void PerfCountEnd::set_accumulated_time() {
    auto current_time = std::chrono::high_resolution_clock::now();
    auto& start_time = m_pc_begin->get_start_time();
    accumulation.local() += std::chrono::duration_cast<std::chrono::nanoseconds>(current_time - start_time).count();
    iteration.local()++;
}

void PerfCountEnd::init_pc_begin() {
    m_pc_begin = ov::as_type_ptr<PerfCountBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
    NODE_VALIDATION_CHECK(this, m_pc_begin != nullptr, "PerfCountEnd last input is not connected to PerfCountBegin");
}

void PerfCountEnd::output_perf_count() {
    OPENVINO_ASSERT(accumulation.size() == iteration.size(), "accumulation size should be the same as iteration size in the perf_count_end node.");
    auto iterator_iter = iteration.begin();
    auto iterator_acc = accumulation.begin();
    int t_num = 0;
    uint64_t avg_max = 0;
    std::cout << "Perf count data in perfCountEnd node with name " << get_friendly_name() << " is:" << std::endl;
    for (; iterator_iter != iteration.end(); ++iterator_iter, ++iterator_acc) {
        const auto iter = *iterator_iter;
        const auto acc = *iterator_acc;
        uint64_t avg = iter == 0 ? 0 : acc / iter;
        if (avg > avg_max)
            avg_max = avg;
        std::cout << "accumulated time:" << acc << "ns, iteration:" << iter << " avg time:" << avg << "ns" << " on thread:" << t_num << std::endl;
        t_num++;
    }

    // max time across all threads: combine via max reduction
    auto BinaryFunc = [](const uint64_t& a, const uint64_t& b) {
        return a >= b ? a : b;
    };
    // max accumulation
    uint64_t acc_max = accumulation.combine(BinaryFunc);
    std::cout << "max accumulated time:" << acc_max << "ns" << std::endl;
    // max avg
    std::cout << "max avg time:" << avg_max << "ns" << std::endl;
}

} // namespace op
} // namespace snippets
} // namespace ov
@ -42,6 +42,7 @@
 #include "snippets/lowered/pass/validate_loops.hpp"
 #include "snippets/lowered/pass/insert_loops.hpp"
 #include "snippets/lowered/pass/optimize_domain.hpp"
+#include "snippets/lowered/pass/insert_perf_count.hpp"

 #include "transformations/utils/utils.hpp"

@ -349,7 +350,8 @@ VectorDims Subgraph::infer_master_shape() {
 std::shared_ptr<lowered::LinearIR>
 Subgraph::convert_body_to_linear_ir(const std::shared_ptr<IShapeInferSnippetsFactory>& shape_infer_factory) {
     lowered::Config lowering_config;
-    lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops;
+    lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops ||
+        (lowering_config.perf_count_mode != lowered::PerfCountMode::Disabled);
     lowering_config.m_need_fill_tail_register = config.m_has_domain_sensitive_ops;
     lowering_config.m_loop_depth = tileRank;
     lowering_config.m_enable_domain_optimization = !config.m_has_domain_sensitive_ops;
@ -487,6 +489,10 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const lowered::pass::PassPi
     auto linear_ir {*m_linear_ir->clone()};
     LoweringResult lowering_result;
     control_flow_transformations(linear_ir, lowering_result, backend_passes_pre_common, backend_passes_post_common);
+    if (linear_ir.get_config().perf_count_mode == lowered::PerfCountMode::Chrono) {
+        lowered::pass::InsertPerfCount perf_count_pass;
+        perf_count_pass.run(linear_ir);
+    }
     m_generator->generate(linear_ir, lowering_result, compile_params);

     VectorDims parallel_exec_domain = linear_ir.get_master_shape();
@ -55,6 +55,8 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry
 SHAPE_INFER_PREDEFINED(op::Scalar, SingleElementShapeInfer),
 SHAPE_INFER_PREDEFINED(op::VectorBuffer, SingleElementShapeInfer),
 SHAPE_INFER_PREDEFINED(op::LoopEnd, EmptyShapeInfer),
+SHAPE_INFER_PREDEFINED(op::PerfCountBegin, EmptyShapeInfer),
+SHAPE_INFER_PREDEFINED(op::PerfCountEnd, EmptyShapeInfer),
 SHAPE_INFER_PREDEFINED(op::Kernel, EmptyShapeInfer),
 SHAPE_INFER_PREDEFINED(op::Nop, EmptyShapeInfer),
 SHAPE_INFER_OP_SPECIFIC_EXTERNAL(opset1::Select, SelectShapeInfer),
@ -41,6 +41,8 @@ DummyTargetMachine::DummyTargetMachine(const std::vector<ov::Node::type_info_t>&
 jitters[ov::snippets::op::Kernel::get_type_info_static()] = dummy_functor;
 jitters[ov::snippets::op::LoopBegin::get_type_info_static()] = dummy_functor;
 jitters[ov::snippets::op::LoopEnd::get_type_info_static()] = dummy_functor;
+jitters[ov::snippets::op::PerfCountBegin::get_type_info_static()] = dummy_functor;
+jitters[ov::snippets::op::PerfCountEnd::get_type_info_static()] = dummy_functor;
 jitters[ov::snippets::op::Brgemm::get_type_info_static()] = dummy_functor;
 jitters[ov::snippets::op::Buffer::get_type_info_static()] = dummy_functor;
 jitters[ov::snippets::op::VectorBuffer::get_type_info_static()] = dummy_functor;
@ -116,6 +116,20 @@ struct ThreadLocal {
     auto end() const -> Iterator<decltype(_map.end())> const {
         return {_map.end()};
     }

+    // CombineFunc has signature T(T,T) or T(const T&, const T&)
+    template <typename CombineFunc>
+    T combine(CombineFunc f_combine) {
+        if (begin() != end()) {
+            auto ci = begin();
+            T my_result = *ci;
+            while (++ci != end())
+                my_result = f_combine(my_result, *ci);
+            return my_result;
+        } else {
+            return _create();
+        }
+    }
 };

 #endif
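A short sketch of how the new combine() is used for a max-reduction over per-thread values (mirroring PerfCountEnd::output_perf_count above):

    // Reduce per-thread accumulators to a single maximum.
    ov::threading::ThreadLocal<uint64_t> accumulation;  // one counter per thread
    // ... each thread updates accumulation.local() ...
    const uint64_t acc_max = accumulation.combine([](const uint64_t& a, const uint64_t& b) {
        return a >= b ? a : b;  // max reduction
    });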
@ -14,12 +14,15 @@
 #include "jit_dnnl_emitters.hpp"
 #include "jit_dnnl_ext_emitters.hpp"
 #include "jit_conversion_emitters.hpp"
+#include "jit_perf_count_chrono_emitters.hpp"
+#include "jit_perf_count_rdtsc_emitters.hpp"

 #include "transformations/snippets/x64/op/load_convert.hpp"
 #include "transformations/snippets/x64/op/store_convert.hpp"
 #include "transformations/snippets/x64/op/fused_mul_add.hpp"
 #include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
 #include "transformations/snippets/x64/op/brgemm_cpu.hpp"
+#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp"
 #include "transformations/cpu_opset/common/op/swish_cpu.hpp"
 #include "transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp"

@ -157,6 +160,11 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho
 jitters[snippets::op::LoopEnd::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(LoopEndEmitter);
 jitters[intel_cpu::BrgemmCPU::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(BrgemmEmitter);
 jitters[intel_cpu::BrgemmCopyB::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(BrgemmCopyBEmitter);

+jitters[snippets::op::PerfCountBegin::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_start_emitter);
+jitters[snippets::op::PerfCountEnd::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_end_emitter);
+jitters[ov::intel_cpu::PerfCountRdtscBegin::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_start_emitter);
+jitters[ov::intel_cpu::PerfCountRdtscEnd::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_end_emitter);
 }

 size_t intel_cpu::CPUTargetMachine::get_lanes() const {
@ -224,6 +232,10 @@ snippets::Generator::opRegType intel_cpu::CPUGenerator::get_specific_op_reg_type
 }
 bool intel_cpu::CPUGenerator::uses_precompiled_kernel(const std::shared_ptr<snippets::Emitter>& e) const {
     return std::dynamic_pointer_cast<intel_cpu::BrgemmEmitter>(e) ||
-           std::dynamic_pointer_cast<intel_cpu::BrgemmCopyBEmitter>(e);
+           std::dynamic_pointer_cast<intel_cpu::BrgemmCopyBEmitter>(e) ||
+           std::dynamic_pointer_cast<intel_cpu::jit_perf_count_chrono_start_emitter>(e) ||
+           std::dynamic_pointer_cast<intel_cpu::jit_perf_count_chrono_end_emitter>(e) ||
+           std::dynamic_pointer_cast<intel_cpu::jit_perf_count_rdtsc_start_emitter>(e) ||
+           std::dynamic_pointer_cast<intel_cpu::jit_perf_count_rdtsc_end_emitter>(e);
 }
 } // namespace ov
@ -213,5 +213,73 @@ void jit_emitter::emit_code(const std::vector<size_t> &in_idxs, const std::vecto
    emitter_postamble();
}

void jit_emitter::internal_call_preamble() const {
    // gprs
    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
                                     h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);

    h->sub(h->rsp, n_gprs_to_save * gpr_size);
    for (size_t i = 0; i < n_gprs_to_save; ++i)
        h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);

    // mask regs
    // Preserve based on CPU capability rather than the host ISA:
    // emitters with different ISAs may coexist in one subgraph KernelEmitter in the future for performance reasons.
    // E.g. the other emitters' ISA is avx512 while this emitter's ISA is avx2 and an internal call is used;
    // the internal call may use avx512 and clobber the k-regs.
    // Platforms with avx512_common but without avx512_core (Knights Landing) are obsolete and not handled.
    if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
        h->sub(h->rsp, k_mask_num * k_mask_size);
        for (size_t i = 0; i < k_mask_num; ++i) {
            h->kmovq(h->ptr[h->rsp + i * k_mask_size], Xbyak::Opmask(static_cast<int>(i)));
        }
    }

    // vector regs
    // 1. Caller obligation to save vector registers as callee may use them.
    // 2. There is an implicit assumption that the host code uses the same
    //    `isa` as the injector. Once the assumption is wrong, `vecs_count` and
    //    `vlen` should be replaced with `host_isa::vlen` and
    //    `host_isa::vecs_count`.
    h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
    for (size_t i = 0; i < get_max_vecs_count(); ++i) {
        push_vec(h->ptr[h->rsp + i * get_vec_length()], i);
    }
}

void jit_emitter::internal_call_postamble() const {
    // restore vector registers
    for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
        pop_vec(static_cast<size_t>(i), h->ptr[h->rsp + i * get_vec_length()]);
    }
    h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());

    // restore k regs
    if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
        for (int i = k_mask_num - 1; i >= 0; --i) {
            h->kmovq(Xbyak::Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
        }
        h->add(h->rsp, k_mask_num * k_mask_size);
    }

    // restore gpr registers
    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
                                     h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
    for (int i = n_gprs_to_save - 1; i >= 0; --i)
        h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
    h->add(h->rsp, n_gprs_to_save * gpr_size);
}

void jit_emitter::internal_call_rsp_align() const {
    h->mov(h->rbx, h->rsp);
    h->and_(h->rbx, 0xf);
    h->sub(h->rsp, h->rbx);
}

void jit_emitter::internal_call_rsp_restore() const {
    h->add(h->rsp, h->rbx);
}

} // namespace intel_cpu
} // namespace ov
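Taken together, these helpers give emitters a standard recipe for calling back into C++ from JIT code; the perf count emitters below follow it exactly. Schematically (callback_fn and arg_ptr are placeholders for a void(*)(T*) function and its argument):

    internal_call_preamble();                                // save gprs, k-regs, vector regs
    h->mov(h->rax, reinterpret_cast<size_t>(callback_fn));   // function address
    h->mov(abi_param1, reinterpret_cast<size_t>(arg_ptr));   // first ABI argument
    internal_call_rsp_align();                               // 16-byte stack alignment (uses rbx)
    h->call(h->rax);
    internal_call_rsp_restore();                             // undo the alignment adjustment
    internal_call_postamble();                               // restore all saved registers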
@ -106,6 +106,8 @@ protected:
 mutable std::vector<size_t> aux_gpr_idxs;

 static constexpr int k_mask_size = 8;
 static constexpr int k_mask_num = 8;
 static constexpr int gpr_size = 8;

 Xbyak::Address table_val(std::string key, size_t key_off_val_shift = 0) const {
     auto off = table_off(key, key_off_val_shift);
@ -130,6 +132,13 @@ protected:
     }
 }

+void internal_call_preamble() const;
+void internal_call_postamble() const;
+// align the stack on a 16-byte boundary, as the ABI requires
+// rbx holds the alignment adjustment, so it must not be changed between internal_call_rsp_align()
+// and internal_call_rsp_restore(); the callee is responsible for saving and restoring rbx.
+void internal_call_rsp_align() const;
+void internal_call_rsp_restore() const;

 private:
 mutable std::vector<size_t> preserved_vec_idxs;
 mutable std::vector<size_t> preserved_gpr_idxs;
@ -0,0 +1,73 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "jit_emitter.hpp"
#include "jit_perf_count_chrono_emitters.hpp"
#include <cpu/x64/jit_generator.hpp>

using namespace dnnl::impl;
using namespace dnnl::impl::utils;
using namespace dnnl::impl::cpu;
using namespace dnnl::impl::cpu::x64;
using namespace Xbyak;
using namespace Xbyak::util;

namespace ov {
namespace intel_cpu {

jit_perf_count_chrono_start_emitter::jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                                                         const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
    m_start_node = ov::as_type_ptr<snippets::op::PerfCountBegin>(n);
}

size_t jit_perf_count_chrono_start_emitter::get_inputs_num() const {
    return 0;
}

void jit_perf_count_chrono_start_emitter::set_start_time(snippets::op::PerfCountBegin* start_node) {
    start_node->set_start_time();
}

void jit_perf_count_chrono_start_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
    internal_call_preamble();

    const auto &set_start_time_overload = static_cast<void (*)(snippets::op::PerfCountBegin*)>(set_start_time);
    h->mov(h->rax, reinterpret_cast<size_t>(set_start_time_overload));
    h->mov(abi_param1, reinterpret_cast<size_t>(m_start_node.get()));
    internal_call_rsp_align();
    h->call(h->rax);
    internal_call_rsp_restore();

    internal_call_postamble();
}

///////////////////jit_perf_count_chrono_end_emitter////////////////////////////////////
jit_perf_count_chrono_end_emitter::jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                                                     const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
    m_end_node = ov::as_type_ptr<snippets::op::PerfCountEnd>(n);
}

size_t jit_perf_count_chrono_end_emitter::get_inputs_num() const {
    return 0;
}

void jit_perf_count_chrono_end_emitter::set_accumulated_time(snippets::op::PerfCountEnd* end_node) {
    end_node->set_accumulated_time();
}

void jit_perf_count_chrono_end_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
    internal_call_preamble();

    const auto &set_accumulated_time_overload = static_cast<void (*)(snippets::op::PerfCountEnd*)>(set_accumulated_time);
    h->mov(h->rax, reinterpret_cast<size_t>(set_accumulated_time_overload));
    h->mov(abi_param1, reinterpret_cast<size_t>(m_end_node.get()));
    internal_call_rsp_align();
    h->call(h->rax);
    internal_call_rsp_restore();

    internal_call_postamble();
}

} // namespace intel_cpu
} // namespace ov
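In effect, the generated code at each marker is functionally equivalent to a direct call on the captured node pointer (a conceptual reading, not the literal emitted instructions):

    // begin marker: set_start_time(m_start_node.get());
    // end marker:   set_accumulated_time(m_end_node.get());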
@ -0,0 +1,40 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "jit_emitter.hpp"
#include <cpu/x64/jit_generator.hpp>

#include "snippets/op/perf_count.hpp"

namespace ov {
namespace intel_cpu {

class jit_perf_count_chrono_start_emitter : public jit_emitter {
public:
    jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                        const std::shared_ptr<ov::Node>& n);
    size_t get_inputs_num() const override;

private:
    void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
    static void set_start_time(snippets::op::PerfCountBegin* start_node);
    std::shared_ptr<snippets::op::PerfCountBegin> m_start_node = nullptr;
};

class jit_perf_count_chrono_end_emitter : public jit_emitter {
public:
    jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                      const std::shared_ptr<ov::Node>& n);
    size_t get_inputs_num() const override;

private:
    void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
    static void set_accumulated_time(snippets::op::PerfCountEnd* end_node);
    std::shared_ptr<snippets::op::PerfCountEnd> m_end_node = nullptr;
};

} // namespace intel_cpu
} // namespace ov
@ -0,0 +1,86 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "jit_emitter.hpp"
#include "jit_perf_count_rdtsc_emitters.hpp"
#include <cpu/x64/jit_generator.hpp>

using namespace dnnl::impl;
using namespace dnnl::impl::utils;
using namespace dnnl::impl::cpu;
using namespace dnnl::impl::cpu::x64;
using namespace Xbyak;
using namespace Xbyak::util;

namespace ov {
namespace intel_cpu {

jit_perf_count_rdtsc_start_emitter::jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                                                       const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
    m_start_node = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscBegin>(n);
}

size_t jit_perf_count_rdtsc_start_emitter::get_inputs_num() const {
    return 0;
}

void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
    h->push(h->rax);
    h->push(h->rdx);

    // RDTSC loads the high-order 32 bits of the time-stamp counter into EDX and the low-order 32 bits into EAX.
    h->lfence();
    h->rdtsc();
    h->lfence();
    h->shl(h->rdx, 0x20);     // shift to the upper half of rdx (0x20 == 32)
    h->or_(h->rdx, h->rax);   // rdx now holds the current tsc

    h->mov(h->rax, reinterpret_cast<size_t>(&m_start_node->start_count));
    h->mov(qword[h->rax], h->rdx);

    h->pop(h->rdx);
    h->pop(h->rax);
}

///////////////////jit_perf_count_rdtsc_end_emitter////////////////////////////////////
jit_perf_count_rdtsc_end_emitter::jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                                                   const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
    m_end_node = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscEnd>(n);
}

size_t jit_perf_count_rdtsc_end_emitter::get_inputs_num() const {
    return 0;
}

void jit_perf_count_rdtsc_end_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
    h->push(h->rax);
    h->push(h->rdx);

    h->lfence();
    h->rdtsc();
    h->lfence();
    h->shl(h->rdx, 0x20);
    h->or_(h->rdx, h->rax);   // rdx now holds the current tsc

    // tsc duration
    h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->get_pc_begin()->start_count));
    h->sub(h->rdx, qword[h->rax]);  // rdx now holds the tsc duration

    // accumulation += tsc duration
    h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->accumulation));
    h->add(h->rdx, qword[h->rax]);
    h->mov(qword[h->rax], h->rdx);

    // iteration++
    h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->iteration));
    h->mov(h->rdx, qword[h->rax]);
    h->add(h->rdx, 0x01);
    h->mov(qword[h->rax], h->rdx);

    h->pop(h->rdx);
    h->pop(h->rax);
}

} // namespace intel_cpu
} // namespace ov
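For reference, the lfence/rdtsc/lfence sequence the emitters generate corresponds to this plain C++ helper (a sketch using standard x86 intrinsics; use <intrin.h> on MSVC):

    #include <cstdint>
    #include <x86intrin.h>  // _mm_lfence, __rdtsc

    // Serialized TSC read: the fences keep the timestamp from being reordered
    // relative to the surrounding instructions, matching the emitted sequence.
    static inline uint64_t read_tsc_serialized() {
        _mm_lfence();
        const uint64_t tsc = __rdtsc();  // EDX:EAX combined into 64 bits
        _mm_lfence();
        return tsc;
    }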
@ -0,0 +1,37 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "jit_emitter.hpp"
#include <cpu/x64/jit_generator.hpp>
#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp"

namespace ov {
namespace intel_cpu {

class jit_perf_count_rdtsc_start_emitter : public jit_emitter {
public:
    jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                       const std::shared_ptr<ov::Node>& n);
    size_t get_inputs_num() const override;

private:
    void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
    std::shared_ptr<ov::intel_cpu::PerfCountRdtscBegin> m_start_node = nullptr;
};

class jit_perf_count_rdtsc_end_emitter : public jit_emitter {
public:
    jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                     const std::shared_ptr<ov::Node>& n);
    size_t get_inputs_num() const override;

private:
    void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
    std::shared_ptr<ov::intel_cpu::PerfCountRdtscEnd> m_end_node = nullptr;
};

} // namespace intel_cpu
} // namespace ov
@ -26,10 +26,6 @@ using jit_generator = dnnl::impl::cpu::x64::jit_generator;
 using cpu_isa_t = dnnl::impl::cpu::x64::cpu_isa_t;
 using ExpressionPtr = ov::snippets::lowered::ExpressionPtr;

-namespace {
-constexpr size_t gpr_size = 8;
-} // namespace
-
 inline static void transform_idxs_to_regs(const std::vector<size_t>& idxs, std::vector<Reg64>& regs) {
     regs.resize(idxs.size());
     std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx){return Reg64(static_cast<int>(idx));});
@ -1114,32 +1110,7 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, c
     h->add(h->rsp, n_gprs_to_save * gpr_size);
 }

-    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
-                                     h->rax, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx};
-    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
-
-    h->sub(h->rsp, n_gprs_to_save * gpr_size);
-    for (size_t i = 0; i < n_gprs_to_save; ++i)
-        h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);
-
-    // caller obligation to save k-regs as callee may use them
-    size_t n_k_regs_to_save = 8;
-    h->sub(h->rsp, n_k_regs_to_save * k_mask_size);
-    for (size_t i = 0; i < n_k_regs_to_save; ++i) {
-        if (mayiuse(avx512_core))
-            h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
-        else
-            h->kmovw(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
-    }
-
-    // 1. Caller obligation to save vector registers as callee may use them.
-    // 2. There is an implicit assumption that the host code uses the same
-    //    `isa` as the injector. Once the assumption is wrong, `vecs_count` and
-    //    `vlen` should be replaced with `host_isa::vlen` and
-    //    `host_isa::vecs_count`.
-    h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
-    for (size_t i = 0; i < get_max_vecs_count(); ++i)
-        h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Zmm(i));
+    internal_call_preamble();

     // save function address in gpr to pass in call instruction
     const auto& brgemm_kernel_overload = static_cast<void (*)(const brgemm_kernel_t*,
@ -1193,38 +1164,15 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, c
     h->mov(abi_param6, static_cast<int>(m_with_comp));
 #endif

-    // align stack on 16-byte as ABI requires
-    // note that RBX must not be changed by the callee
-    h->mov(h->rbx, h->rsp);
-    h->and_(h->rbx, 0xf);
-    h->sub(h->rsp, h->rbx);
-
+    internal_call_rsp_align();
     h->call(h->rbp);
-
-    h->add(h->rsp, h->rbx);
+    internal_call_rsp_restore();

 #ifdef _WIN32
     h->add(h->rsp, num_args_passed_on_stack * gpr_size);
 #endif
-    // restore vector registers
-    for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
-        h->uni_vmovups(Zmm(i), h->ptr[h->rsp + i * get_vec_length()]);
-    }
-    h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());
-
-    // restore k registers
-    for (int i = n_k_regs_to_save - 1; i >= 0; --i) {
-        if (mayiuse(avx512_core))
-            h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
-        else
-            h->kmovw(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
-    }
-    h->add(h->rsp, n_k_regs_to_save * k_mask_size);
-
-    // restore gpr registers
-    for (int i = n_gprs_to_save - 1; i >= 0; --i)
-        h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
-    h->add(h->rsp, n_gprs_to_save * gpr_size);
+    internal_call_postamble();
 }

 void BrgemmEmitter::kernel_execute(const brgemm_kernel_t *brg_kernel,
@ -1358,32 +1306,7 @@ void BrgemmCopyBEmitter::emit_impl(const std::vector<size_t>& in,

 void BrgemmCopyBEmitter::emit_kernel_call(const matmul::jit_brgemm_matmul_copy_b_t* kernel, Reg64 src, Reg64 dst, Reg64 comp,
                                           size_t N, size_t K, size_t offset_in, size_t offset_out, size_t offset_comp) const {
-    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
-                                     h->rax, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx};
-    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
-
-    h->sub(h->rsp, n_gprs_to_save * gpr_size);
-    for (size_t i = 0; i < n_gprs_to_save; ++i)
-        h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);
-
-    // caller obligation to save k-regs as callee may use them
-    size_t n_k_regs_to_save = 8;
-    h->sub(h->rsp, n_k_regs_to_save * k_mask_size);
-    for (size_t i = 0; i < n_k_regs_to_save; ++i) {
-        if (mayiuse(avx512_core))
-            h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
-        else
-            h->kmovw(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
-    }
-
-    // 1. Caller obligation to save vector registers as callee may use them.
-    // 2. There is an implicit assumption that the host code uses the same
-    //    `isa` as the injector. Once the assumption is wrong, `vecs_count` and
-    //    `vlen` should be replaced with `host_isa::vlen` and
-    //    `host_isa::vecs_count`.
-    h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
-    for (size_t i = 0; i < get_max_vecs_count(); ++i)
-        h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Zmm(i));
+    internal_call_preamble();

     const auto data_ptr = [&](Xmm xmm, Xbyak::Reg64 reg, size_t bytes_offset) {
         h->uni_vmovq(reg, xmm);
@ -1437,38 +1360,16 @@ void BrgemmCopyBEmitter::emit_kernel_call(const matmul::jit_brgemm_matmul_copy_b
     h->mov(abi_param5, N);
     h->mov(abi_param6, K);
 #endif
-    // align stack on 16-byte as ABI requires
-    // note that RBX must not be changed by the callee
-    h->mov(h->rbx, h->rsp);
-    h->and_(h->rbx, 0xf);
-    h->sub(h->rsp, h->rbx);
-
+    internal_call_rsp_align();
     h->call(h->rbp);
-
-    h->add(h->rsp, h->rbx);
+    internal_call_rsp_restore();

 #ifdef _WIN32
     h->add(h->rsp, gpr_size * num_args_passed_on_stack);
 #endif
-    // restore vector registers
-    for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
-        h->uni_vmovups(Zmm(i), h->ptr[h->rsp + i * get_vec_length()]);
-    }
-    h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());
-
-    // restore k registers
-    for (int i = n_k_regs_to_save - 1; i >= 0; --i) {
-        if (mayiuse(avx512_core))
-            h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
-        else
-            h->kmovw(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
-    }
-    h->add(h->rsp, n_k_regs_to_save * k_mask_size);
-
-    // restore gpr registers
-    for (int i = n_gprs_to_save - 1; i >= 0; --i)
-        h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
-    h->add(h->rsp, n_gprs_to_save * gpr_size);
+    internal_call_postamble();
 }

 void BrgemmCopyBEmitter::execute(matmul::jit_brgemm_matmul_copy_b_t *kernel, const void *src,
@ -15,6 +15,7 @@
 #include "transformations/snippets/x64/op/store_convert.hpp"
 #include "transformations/snippets/x64/op/brgemm_cpu.hpp"
 #include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
+#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp"

 #include <ov_ops/augru_cell.hpp>
 #include <ov_ops/augru_sequence.hpp>
@ -159,12 +160,16 @@ std::map<std::string, ngraph::OpSet> Extension::getOpSets() {
 NGRAPH_OP(Subgraph, ov::snippets::op)
 NGRAPH_OP(VectorBuffer, ov::snippets::op)
 NGRAPH_OP(RankNormalization, ov::snippets::op)
+NGRAPH_OP(PerfCountBegin, ov::snippets::op)
+NGRAPH_OP(PerfCountEnd, ov::snippets::op)
 NGRAPH_OP_X64(LoadConvertSaturation, ov::intel_cpu)
 NGRAPH_OP_X64(LoadConvertTruncation, ov::intel_cpu)
 NGRAPH_OP_X64(StoreConvertSaturation, ov::intel_cpu)
 NGRAPH_OP_X64(StoreConvertTruncation, ov::intel_cpu)
 NGRAPH_OP_X64(BrgemmCPU, ov::intel_cpu)
 NGRAPH_OP_X64(BrgemmCopyB, ov::intel_cpu)
+NGRAPH_OP_X64(PerfCountRdtscBegin, ov::intel_cpu)
+NGRAPH_OP_X64(PerfCountRdtscEnd, ov::intel_cpu)
 #undef NGRAPH_OP

 return opset;
@ -0,0 +1,32 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "perf_count_rdtsc.hpp"

using namespace ov;
using namespace ov::intel_cpu;

/////////////////////////PerfCountRdtscBegin//////////////////////
PerfCountRdtscBegin::PerfCountRdtscBegin() : PerfCountBeginBase() {
    validate_and_infer_types_except_PerfCountEnd();
}

std::shared_ptr<Node> PerfCountRdtscBegin::clone_with_new_inputs(const OutputVector& inputs) const {
    return std::make_shared<PerfCountRdtscBegin>();
}

/////////////////////////PerfCountRdtscEnd//////////////////////
PerfCountRdtscEnd::PerfCountRdtscEnd(const Output<Node>& pc_begin) : ov::snippets::op::PerfCountEndBase({pc_begin}), accumulation(0ul), iteration(0u) {
    constructor_validate_and_infer_types();
}

std::shared_ptr<Node> PerfCountRdtscEnd::clone_with_new_inputs(const OutputVector& inputs) const {
    return std::make_shared<PerfCountRdtscEnd>(inputs.at(0));
}

std::shared_ptr<PerfCountRdtscBegin> PerfCountRdtscEnd::get_pc_begin() {
    const auto& pc_begin = ov::as_type_ptr<PerfCountRdtscBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
    OPENVINO_ASSERT(pc_begin != nullptr, "PerfCountRdtscEnd last input is not connected to PerfCountRdtscBegin");
    return pc_begin;
}
@ -0,0 +1,55 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"
#include "snippets/op/perf_count.hpp"

using namespace ov::snippets::op;

namespace ov {
namespace intel_cpu {

/**
 * @interface PerfCountRdtscBegin
 * @brief Performance count start time via reading the RDTSC register
 * @ingroup snippets
 */
class PerfCountRdtscBegin : public PerfCountBeginBase {
public:
    OPENVINO_OP("PerfCountRdtscBegin", "SnippetsOpset", PerfCountBeginBase);
    PerfCountRdtscBegin();
    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;

    uint64_t start_count = 0ul;
};

/**
 * @interface PerfCountRdtscEnd
 * @brief Performance count end time and duration
 * @ingroup snippets
 */
class PerfCountRdtscEnd : public PerfCountEndBase {
public:
    OPENVINO_OP("PerfCountRdtscEnd", "SnippetsOpset", PerfCountEndBase);
    PerfCountRdtscEnd(const Output<Node>& pc_begin);
    PerfCountRdtscEnd() = default;
    ~PerfCountRdtscEnd() {
        uint64_t avg = iteration == 0 ? 0 : accumulation / iteration;
        std::cout << "accumulation:" << accumulation << " iteration:" << iteration << " avg:" << avg << std::endl;
    }
    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;

    std::shared_ptr<PerfCountRdtscBegin> get_pc_begin();
    // On each call, PerfCountRdtscBegin records start_count.
    // On each call, PerfCountRdtscEnd reads end_count, then accumulation += end_count - start_count and iteration++.
    // The destructor of PerfCountRdtscEnd outputs the perf info.
    // accumulation is a cycle count
    uint64_t accumulation = 0ul;
    uint32_t iteration = 0u;
};

} // namespace intel_cpu
} // namespace ov
@ -9,6 +9,7 @@
 #include "op/fused_mul_add.hpp"
 #include "op/load_convert.hpp"
 #include "op/store_convert.hpp"
+#include "op/perf_count_rdtsc.hpp"
 #include "transformations/cpu_opset/common/op/swish_cpu.hpp"

 namespace ov {
@ -38,6 +39,8 @@ const CPUShapeInferSnippetsFactory::TRegistry CPUShapeInferSnippetsFactory::spec
 SHAPE_INFER_PREDEFINED(ov::intel_cpu::LoadConvertTruncation, PassThroughShapeInfer),
 SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertSaturation, PassThroughShapeInfer),
 SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertTruncation, PassThroughShapeInfer),
+SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscBegin, EmptyShapeInfer),
+SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscEnd, EmptyShapeInfer),
 SHAPE_INFER_OP_SPECIFIC_EXTERNAL(ov::intel_cpu::BrgemmCPU, BrgemmShapeInfer),
 //
 SHAPE_INFER_OP_SPECIFIC(ov::intel_cpu::BrgemmCopyB),