diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp
index 32b44b9e6ab..1647ccf1e77 100644
--- a/src/common/snippets/include/snippets/generator.hpp
+++ b/src/common/snippets/include/snippets/generator.hpp
@@ -29,7 +29,7 @@ class Generator;
 class LoweringResult {
     friend class Generator;
     // Some emitters rely on other precompiled kernels.
-    // We need to keep the pointers to such emitters alive, so the kernels would still be accessible at runtime.
+    // We need to keep the pointers to such emitters alive, so the kernels or nodes would still be accessible at runtime.
     std::vector<std::shared_ptr<Emitter>> m_saved_emitters{};
 
 public:
diff --git a/src/common/snippets/include/snippets/lowered/linear_ir.hpp b/src/common/snippets/include/snippets/lowered/linear_ir.hpp
index 22ffc7ff36d..55722b4c03d 100644
--- a/src/common/snippets/include/snippets/lowered/linear_ir.hpp
+++ b/src/common/snippets/include/snippets/lowered/linear_ir.hpp
@@ -14,6 +14,18 @@ namespace ov {
 namespace snippets {
 namespace lowered {
 
+// Snippets performance count mode
+// Disabled - default; no perf count for snippets.
+// Chrono - perf count via std::chrono calls. A universal method that also supports the multi-threaded case
+//          and reports perf count data for each thread.
+// BackendSpecific - perf count provided by the backend, for device-specific requirements.
+//          For example, for lower overhead and more accurate results, an x86 CPU-specific mode that reads the
+//          RDTSC register is implemented: a perf-count start/end pair takes ~50ns, versus ~260ns for the Chrono
+//          mode, measured on ICX. This mode supports only a single thread.
+enum PerfCountMode {
+    Disabled,
+    Chrono,
+    BackendSpecific,
+};
+
 class Config {
 public:
     // True if the lowered Emitters need to be accessed during runtime. Normally they're destroyed after code emission.
@@ -21,6 +33,7 @@ public:
     // True if we should check runtime info for nodes to call specific needed transformations
     bool m_need_fill_tail_register = false;
     size_t m_loop_depth = 1;
+    PerfCountMode perf_count_mode = PerfCountMode::Disabled;
     // Some Subgraphs doesn't support domain optimization due to operations' semantics
     bool m_enable_domain_optimization = false;
     // Minimal advised work amount for parallel execution.
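The new PerfCountMode knob lives in the lowered Config; the patch wires it into Subgraph::convert_body_to_linear_ir further down but adds no dedicated setter. A minimal sketch of flipping it follows; the helper function and its name are illustrative, not part of the patch:

// Hypothetical helper, not part of this patch: enable Chrono perf counting on a lowered Config.
#include "snippets/lowered/linear_ir.hpp"

void enable_chrono_perf_count(ov::snippets::lowered::Config& cfg) {
    // Chrono supports multiple threads; BackendSpecific (RDTSC on x86) is single-thread only.
    cfg.perf_count_mode = ov::snippets::lowered::PerfCountMode::Chrono;
}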
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp
new file mode 100644
index 00000000000..8478a0d931f
--- /dev/null
+++ b/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp
@@ -0,0 +1,33 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "pass.hpp"
+
+#include "snippets/op/perf_count.hpp"
+
+namespace ov {
+namespace snippets {
+namespace lowered {
+namespace pass {
+
+/**
+ * @interface InsertPerfCount
+ * @brief Insert a PerfCountBegin node after the last Parameter and a PerfCountEnd node before the first Result.
+ *        This is an illustrative transformation that shows how to enable perf counting in snippets.
+ *        Developers can modify it to insert perf count pairs around any sequence of nodes of interest.
+ * @ingroup snippets
+ */
+class InsertPerfCount: public Pass {
+public:
+    OPENVINO_RTTI("InsertPerfCount", "Pass")
+    InsertPerfCount() = default;
+    bool run(LinearIR& linear_ir) override;
+};
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
diff --git a/src/common/snippets/include/snippets/op/perf_count.hpp b/src/common/snippets/include/snippets/op/perf_count.hpp
new file mode 100644
index 00000000000..be7eccfc5b4
--- /dev/null
+++ b/src/common/snippets/include/snippets/op/perf_count.hpp
@@ -0,0 +1,93 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/op/op.hpp"
+#include "openvino/runtime/threading/thread_local.hpp"
+
+namespace ov {
+namespace snippets {
+namespace op {
+
+/**
+ * @interface PerfCountBeginBase
+ * @brief Base class for PerfCountBegin and PerfCountRdtscBegin (CPU)
+ * @ingroup snippets
+ */
+class PerfCountBeginBase : public ov::op::Op {
+public:
+    OPENVINO_OP("PerfCountBeginBase", "SnippetsOpset");
+    PerfCountBeginBase(const std::vector<Output<Node>>& args);
+    PerfCountBeginBase() = default;
+
+    void validate_and_infer_types() override;
+    bool visit_attributes(AttributeVisitor& visitor) override;
+
+protected:
+    void validate_and_infer_types_except_PerfCountEnd();
+};
+
+/**
+ * @interface PerfCountEndBase
+ * @brief Base class for PerfCountEnd and PerfCountRdtscEnd
+ * @ingroup snippets
+ */
+class PerfCountEndBase : public ov::op::Op {
+public:
+    OPENVINO_OP("PerfCountEndBase", "SnippetsOpset");
+    PerfCountEndBase(const std::vector<Output<Node>>& args);
+    PerfCountEndBase() = default;
+
+    void validate_and_infer_types() override;
+    bool visit_attributes(AttributeVisitor& visitor) override;
+};
+
+/**
+ * @interface PerfCountBegin
+ * @brief Records the start time via a std::chrono call
+ * @ingroup snippets
+ */
+class PerfCountBegin : public PerfCountBeginBase {
+public:
+    OPENVINO_OP("PerfCountBegin", "SnippetsOpset", PerfCountBeginBase);
+    PerfCountBegin();
+
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
+
+    void set_start_time();
+    std::chrono::high_resolution_clock::time_point& get_start_time();
+
+private:
+    ov::threading::ThreadLocal<std::chrono::high_resolution_clock::time_point> start_time_stamp;
+};
+
+/**
+ * @interface PerfCountEnd
+ * @brief Records the end time and accumulates the duration via std::chrono calls
+ * @ingroup snippets
+ */
+class PerfCountEnd : public PerfCountEndBase {
+public:
+    OPENVINO_OP("PerfCountEnd", "SnippetsOpset", PerfCountEndBase);
+    PerfCountEnd(const Output<Node>& pc_begin);
+    PerfCountEnd() = default;
+    ~PerfCountEnd() {
+        output_perf_count();
+    }
+    void output_perf_count();
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
+
+    void init_pc_begin();
+    void set_accumulated_time();
+
+private:
+    ov::threading::ThreadLocal<uint64_t> accumulation;
+    ov::threading::ThreadLocal<uint32_t> iteration;
+    std::shared_ptr<PerfCountBegin> m_pc_begin = nullptr;
+};
+
+} // namespace op
+} // namespace snippets
+} // namespace ov
diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp
index ba85ae68eeb..b2c6d46b722 100644
--- a/src/common/snippets/include/snippets/snippets_isa.hpp
+++ b/src/common/snippets/include/snippets/snippets_isa.hpp
@@ -25,6 +25,7 @@
 #include "op/brgemm.hpp"
 #include "op/vector_buffer.hpp"
 #include "op/rank_normalization.hpp"
+#include "op/perf_count.hpp"
 
 namespace ov {
 namespace snippets {
diff --git a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp
index 351770bdab7..163329d6394 100644
--- a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp
+++ b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp
@@ -24,6 +24,9 @@
 OV_OP(Scalar, ov::snippets::op)
 OV_OP(Nop, ov::snippets::op)
 OV_OP(RankNormalization, ov::snippets::op)
+OV_OP(PerfCountBegin, ov::snippets::op)
+OV_OP(PerfCountEnd, ov::snippets::op)
+
 // Layout-oblivious from opset1
 // opset completeness
diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp
index cede4c4a6e5..0dacee4878d 100644
--- a/src/common/snippets/src/generator.cpp
+++ b/src/common/snippets/src/generator.cpp
@@ -44,7 +44,8 @@ void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, c
     }
     OV_ITT_TASK_NEXT(GENERATE, "::GetSnippet")
 
-    // Note: some emitters use precompiled kernels. They need to be saved, so the kernels are accessible at runtime.
+    // 1. Some emitters use precompiled kernels. They need to be saved, so the kernels are accessible at runtime.
+    // 2. A perf count node held as a field of an emitter must also stay alive at runtime.
     if (linear_ir.get_config().m_save_expressions) {
         for (const auto& expr : linear_ir) {
             const auto& emitter = expr->get_emitter();
@@ -66,7 +67,9 @@ Generator::opRegType Generator::get_op_reg_type(const std::shared_ptr<Node>& op)
         std::dynamic_pointer_cast<op::LoopBegin>(op) ||
         std::dynamic_pointer_cast<op::LoopEnd>(op) ||
         std::dynamic_pointer_cast<op::Brgemm>(op) ||
-        std::dynamic_pointer_cast<op::Buffer>(op))
+        std::dynamic_pointer_cast<op::Buffer>(op) ||
+        std::dynamic_pointer_cast<op::PerfCountBeginBase>(op) ||
+        std::dynamic_pointer_cast<op::PerfCountEndBase>(op))
         return gpr2gpr;
     else if (std::dynamic_pointer_cast<op::Load>(op) ||
              std::dynamic_pointer_cast<op::BroadcastLoad>(op))
diff --git a/src/common/snippets/src/lowered/pass/insert_perf_count.cpp b/src/common/snippets/src/lowered/pass/insert_perf_count.cpp
new file mode 100644
index 00000000000..2ed02c9010d
--- /dev/null
+++ b/src/common/snippets/src/lowered/pass/insert_perf_count.cpp
@@ -0,0 +1,62 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/lowered/pass/insert_perf_count.hpp"
+#include "snippets/lowered/linear_ir.hpp"
+#include "snippets/snippets_isa.hpp"
+#include "snippets/itt.hpp"
+
+namespace ov {
+namespace snippets {
+namespace lowered {
+namespace pass {
+
+bool InsertPerfCount::run(LinearIR& linear_ir) {
+    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertPerfCount")
+    if (linear_ir.empty())
+        return false;
+
+    auto is_parameter = [](const std::shared_ptr<ov::Node>& node) {
+        return ov::is_type<ov::op::v0::Parameter>(node);
+    };
+    auto is_result = [](const std::shared_ptr<ov::Node>& node) {
+        return ov::is_type<ov::op::v0::Result>(node);
+    };
+
+    // mark the perf_count_begin and perf_count_end positions
+    auto perf_count_begin_pos = linear_ir.cbegin();
+    auto perf_count_end_pos = perf_count_begin_pos;
+    bool first_result_marked = false;
+    for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) {
+        const auto expr = *expr_it;
+        const auto& node = expr->get_node();
+        if (is_parameter(node))
+            perf_count_begin_pos = expr_it;
+
+        if (is_result(node) && !first_result_marked) {
+            perf_count_end_pos = expr_it;
+            first_result_marked = true;
+        }
+    }
+
+    // insert perf_count_begin after the last parameter
+    // linear_ir.insert inserts before the given position, so move to the next one
+    perf_count_begin_pos = std::next(perf_count_begin_pos);
+    const auto& perf_count_begin = std::make_shared<op::PerfCountBegin>();
+    const auto& perf_count_begin_expr = linear_ir.create_expression(perf_count_begin, std::vector<PortConnectorPtr>{});
+    linear_ir.insert(perf_count_begin_pos, perf_count_begin_expr);
+
+    // insert perf_count_end before the first result
+    const auto& perf_count_end = std::make_shared<op::PerfCountEnd>(perf_count_begin->output(0));
+    perf_count_end->set_friendly_name("last_parameter_to_first_result");
+    const auto& perf_count_end_expr = linear_ir.create_expression(perf_count_end, std::vector<PortConnectorPtr>{});
+    linear_ir.insert(perf_count_end_pos, perf_count_end_expr);
+
+    return true;
+}
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
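For orientation, this is how the pass is meant to be driven; it mirrors the invocation the patch adds to subgraph.cpp further down (the wrapper function is illustrative only):

// Illustrative driver: time the region between the last Parameter and the first Result.
#include "snippets/lowered/pass/insert_perf_count.hpp"

void time_subgraph_body(ov::snippets::lowered::LinearIR& linear_ir) {
    ov::snippets::lowered::pass::InsertPerfCount insert_perf_count;
    insert_perf_count.run(linear_ir);  // PerfCountBegin after the last Parameter, PerfCountEnd before the first Result
}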
diff --git a/src/common/snippets/src/op/perf_count.cpp b/src/common/snippets/src/op/perf_count.cpp
new file mode 100644
index 00000000000..66061753373
--- /dev/null
+++ b/src/common/snippets/src/op/perf_count.cpp
@@ -0,0 +1,115 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/op/perf_count.hpp"
+
+namespace ov {
+namespace snippets {
+namespace op {
+
+/////////////////PerfCountBeginBase/////////////////
+PerfCountBeginBase::PerfCountBeginBase(const std::vector<Output<Node>>& args) : Op() {}
+
+void PerfCountBeginBase::validate_and_infer_types() {
+    validate_and_infer_types_except_PerfCountEnd();
+    OPENVINO_ASSERT(get_output_size() == 1, "PerfCountBegin must have only one output");
+    const auto& last_output_inputs = get_output_target_inputs(0);
+    OPENVINO_ASSERT(last_output_inputs.size() == 1, "PerfCountBegin must have exactly one input attached to the last output");
+    const auto& pc_end = ov::as_type_ptr<PerfCountEndBase>(last_output_inputs.begin()->get_node()->shared_from_this());
+    OPENVINO_ASSERT(pc_end != nullptr, "PerfCountBegin must have PerfCountEnd connected to its last output");
+}
+
+bool PerfCountBeginBase::visit_attributes(AttributeVisitor &visitor) {
+    return true;
+}
+
+void PerfCountBeginBase::validate_and_infer_types_except_PerfCountEnd() {
+    NODE_VALIDATION_CHECK(this, get_input_size() == 0, "PerfCountBegin doesn't expect any inputs");
+    set_output_type(0, element::f32, {});
+}
+
+//////////////////PerfCountEndBase/////////////////
+PerfCountEndBase::PerfCountEndBase(const std::vector<Output<Node>> &args) : Op(args) {}
+
+void PerfCountEndBase::validate_and_infer_types() {
+    NODE_VALIDATION_CHECK(this, get_input_size() == 1, "PerfCountEndBase must have one input");
+    const auto& pc_begin = ov::as_type_ptr<PerfCountBeginBase>(get_input_node_shared_ptr(0));
+    NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEndBase must have PerfCountBeginBase as the last argument");
+    set_output_type(0, element::f32, {});
+}
+
+bool PerfCountEndBase::visit_attributes(AttributeVisitor &visitor) {
+    return true;
+}
+
+/////////////////PerfCountBegin/////////////////
+PerfCountBegin::PerfCountBegin() : PerfCountBeginBase() {
+    validate_and_infer_types_except_PerfCountEnd();
+}
+
+std::shared_ptr<Node> PerfCountBegin::clone_with_new_inputs(const OutputVector& inputs) const {
+    return std::make_shared<PerfCountBegin>();
+}
+
+std::chrono::high_resolution_clock::time_point& PerfCountBegin::get_start_time() {
+    return start_time_stamp.local();
+}
+
+void PerfCountBegin::set_start_time() {
+    start_time_stamp.local() = std::chrono::high_resolution_clock::now();
+}
+
+//////////////////PerfCountEnd///////////////
+PerfCountEnd::PerfCountEnd(const Output<Node>& pc_begin) : PerfCountEndBase({pc_begin}), accumulation(0ul), iteration(0u) {
+    constructor_validate_and_infer_types();
+    init_pc_begin();
+}
+
+std::shared_ptr<Node> PerfCountEnd::clone_with_new_inputs(const OutputVector& inputs) const {
+    return std::make_shared<PerfCountEnd>(inputs.at(0));
+}
+
+void PerfCountEnd::set_accumulated_time() {
+    auto current_time = std::chrono::high_resolution_clock::now();
+    auto& start_time = m_pc_begin->get_start_time();
+    accumulation.local() += std::chrono::duration_cast<std::chrono::nanoseconds>(current_time - start_time).count();
+    iteration.local()++;
+}
+
+void PerfCountEnd::init_pc_begin() {
+    m_pc_begin = ov::as_type_ptr<PerfCountBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
+    NODE_VALIDATION_CHECK(this, m_pc_begin != nullptr, "PerfCountEnd last input is not connected to PerfCountBegin");
+}
+
+void PerfCountEnd::output_perf_count() {
+    OPENVINO_ASSERT(accumulation.size() == iteration.size(), "accumulation size should be the same as iteration size in the perf_count_end node.");
+    auto iterator_iter = iteration.begin();
+    auto iterator_acc = accumulation.begin();
+    int t_num = 0;
+    uint64_t avg_max = 0;
+    std::cout << "Perf count data in perfCountEnd node with name " << get_friendly_name() << " is:" << std::endl;
+    for (; iterator_iter != iteration.end(); ++iterator_iter, ++iterator_acc) {
+        const auto iter = *iterator_iter;
+        const auto acc = *iterator_acc;
+        uint64_t avg = iter == 0 ? 0 : acc / iter;
+        if (avg > avg_max)
+            avg_max = avg;
+        std::cout << "accumulated time:" << acc << "ns, iteration:" << iter << " avg time:" << avg << "ns" << " on thread:" << t_num << std::endl;
+        t_num++;
+    }
+
+    // max time over all threads: combine for a reduce-max
+    auto BinaryFunc = [](const uint64_t& a, const uint64_t& b) {
+        return a >= b ? a : b;
+    };
+    // max accumulation
+    uint64_t acc_max = accumulation.combine(BinaryFunc);
+    std::cout << "max accumulated time:" << acc_max << "ns" << std::endl;
+    // max avg
+    std::cout << "max avg time:" << avg_max << "ns" << std::endl;
+}
+
+} // namespace op
+} // namespace snippets
+} // namespace ov
diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp
index 02e87459056..adeed6e26b4 100644
--- a/src/common/snippets/src/op/subgraph.cpp
+++ b/src/common/snippets/src/op/subgraph.cpp
@@ -42,6 +42,7 @@
 #include "snippets/lowered/pass/validate_loops.hpp"
 #include "snippets/lowered/pass/insert_loops.hpp"
 #include "snippets/lowered/pass/optimize_domain.hpp"
+#include "snippets/lowered/pass/insert_perf_count.hpp"
 
 #include "transformations/utils/utils.hpp"
 
@@ -349,7 +350,8 @@ VectorDims Subgraph::infer_master_shape() {
 
 std::shared_ptr<lowered::LinearIR> Subgraph::convert_body_to_linear_ir(const std::shared_ptr<IShapeInferSnippetsFactory>& shape_infer_factory) {
     lowered::Config lowering_config;
-    lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops;
+    lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops ||
+                                         (lowering_config.perf_count_mode != lowered::PerfCountMode::Disabled);
     lowering_config.m_need_fill_tail_register = config.m_has_domain_sensitive_ops;
     lowering_config.m_loop_depth = tileRank;
     lowering_config.m_enable_domain_optimization = !config.m_has_domain_sensitive_ops;
@@ -487,6 +489,10 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const lowered::pass::PassPi
     auto linear_ir {*m_linear_ir->clone()};
     LoweringResult lowering_result;
     control_flow_transformations(linear_ir, lowering_result, backend_passes_pre_common, backend_passes_post_common);
+    if (linear_ir.get_config().perf_count_mode == lowered::PerfCountMode::Chrono) {
+        lowered::pass::InsertPerfCount perf_count_pass;
+        perf_count_pass.run(linear_ir);
+    }
     m_generator->generate(linear_ir, lowering_result, compile_params);
 
     VectorDims parallel_exec_domain = linear_ir.get_master_shape();
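Stripped of the node and ThreadLocal machinery, the chrono accumulation implemented by PerfCountBegin::set_start_time() and PerfCountEnd::set_accumulated_time() above reduces to the following self-contained sketch (PerfPair is an illustrative name, not from the patch):

#include <chrono>
#include <cstdint>

// Per-thread begin/end bookkeeping as done by PerfCountBegin/PerfCountEnd, minus ThreadLocal.
struct PerfPair {
    std::chrono::high_resolution_clock::time_point start;
    uint64_t accumulation = 0;  // nanoseconds
    uint32_t iteration = 0;

    void begin() { start = std::chrono::high_resolution_clock::now(); }
    void end() {
        const auto now = std::chrono::high_resolution_clock::now();
        accumulation += std::chrono::duration_cast<std::chrono::nanoseconds>(now - start).count();
        ++iteration;
    }
    uint64_t avg_ns() const { return iteration == 0 ? 0 : accumulation / iteration; }
};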
diff --git a/src/common/snippets/src/shape_inference/shape_inference.cpp b/src/common/snippets/src/shape_inference/shape_inference.cpp
index 0b9117d05d0..f2c6be9ae0b 100644
--- a/src/common/snippets/src/shape_inference/shape_inference.cpp
+++ b/src/common/snippets/src/shape_inference/shape_inference.cpp
@@ -55,6 +55,8 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry
     SHAPE_INFER_PREDEFINED(op::Scalar, SingleElementShapeInfer),
     SHAPE_INFER_PREDEFINED(op::VectorBuffer, SingleElementShapeInfer),
     SHAPE_INFER_PREDEFINED(op::LoopEnd, EmptyShapeInfer),
+    SHAPE_INFER_PREDEFINED(op::PerfCountBegin, EmptyShapeInfer),
+    SHAPE_INFER_PREDEFINED(op::PerfCountEnd, EmptyShapeInfer),
     SHAPE_INFER_PREDEFINED(op::Kernel, EmptyShapeInfer),
     SHAPE_INFER_PREDEFINED(op::Nop, EmptyShapeInfer),
     SHAPE_INFER_OP_SPECIFIC_EXTERNAL(opset1::Select, SelectShapeInfer),
diff --git a/src/common/snippets/tests/src/lowering_utils.cpp b/src/common/snippets/tests/src/lowering_utils.cpp
index 5d49d38a6af..0fa490353e4 100644
--- a/src/common/snippets/tests/src/lowering_utils.cpp
+++ b/src/common/snippets/tests/src/lowering_utils.cpp
@@ -41,6 +41,8 @@ DummyTargetMachine::DummyTargetMachine(const std::vector<ov::Node::type_info_t>&
     jitters[ov::snippets::op::Kernel::get_type_info_static()] = dummy_functor;
     jitters[ov::snippets::op::LoopBegin::get_type_info_static()] = dummy_functor;
     jitters[ov::snippets::op::LoopEnd::get_type_info_static()] = dummy_functor;
+    jitters[ov::snippets::op::PerfCountBegin::get_type_info_static()] = dummy_functor;
+    jitters[ov::snippets::op::PerfCountEnd::get_type_info_static()] = dummy_functor;
     jitters[ov::snippets::op::Brgemm::get_type_info_static()] = dummy_functor;
     jitters[ov::snippets::op::Buffer::get_type_info_static()] = dummy_functor;
     jitters[ov::snippets::op::VectorBuffer::get_type_info_static()] = dummy_functor;
diff --git a/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp b/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp
index 679d9518baa..a92c312ee72 100644
--- a/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp
+++ b/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp
@@ -116,6 +116,20 @@ struct ThreadLocal {
     auto end() const -> Iterator const {
         return {_map.end()};
     }
+
+    // CombineFunc has signature T(T,T) or T(const T&, const T&)
+    template <typename CombineFunc>
+    T combine(CombineFunc f_combine) {
+        if (begin() != end()) {
+            auto ci = begin();
+            T my_result = *ci;
+            while (++ci != end())
+                my_result = f_combine(my_result, *ci);
+            return my_result;
+        } else {
+            return _create();
+        }
+    }
 };
 
 #endif
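The new ThreadLocal<T>::combine() is what PerfCountEnd::output_perf_count() uses for its cross-thread reduce-max. Usage boils down to this sketch:

#include <cstdint>
#include "openvino/runtime/threading/thread_local.hpp"

// Reduce-max over all threads' accumulated times, as in PerfCountEnd::output_perf_count().
uint64_t max_accumulated(ov::threading::ThreadLocal<uint64_t>& accumulation) {
    return accumulation.combine([](const uint64_t& a, const uint64_t& b) {
        return a >= b ? a : b;
    });
}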
"transformations/snippets/x64/op/perf_count_rdtsc.hpp" #include "transformations/cpu_opset/common/op/swish_cpu.hpp" #include "transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp" @@ -157,6 +160,11 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::LoopEnd::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(LoopEndEmitter); jitters[intel_cpu::BrgemmCPU::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(BrgemmEmitter); jitters[intel_cpu::BrgemmCopyB::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(BrgemmCopyBEmitter); + + jitters[snippets::op::PerfCountBegin::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_start_emitter); + jitters[snippets::op::PerfCountEnd::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_end_emitter); + jitters[ov::intel_cpu::PerfCountRdtscBegin::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_start_emitter); + jitters[ov::intel_cpu::PerfCountRdtscEnd::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_end_emitter); } size_t intel_cpu::CPUTargetMachine::get_lanes() const { @@ -224,6 +232,10 @@ snippets::Generator::opRegType intel_cpu::CPUGenerator::get_specific_op_reg_type } bool intel_cpu::CPUGenerator::uses_precompiled_kernel(const std::shared_ptr& e) const { return std::dynamic_pointer_cast(e) || - std::dynamic_pointer_cast(e); + std::dynamic_pointer_cast(e) || + std::dynamic_pointer_cast(e) || + std::dynamic_pointer_cast(e) || + std::dynamic_pointer_cast(e) || + std::dynamic_pointer_cast(e); } -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp index f9885daa0f6..dbaafdde812 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp @@ -213,5 +213,73 @@ void jit_emitter::emit_code(const std::vector &in_idxs, const std::vecto emitter_postamble(); } +void jit_emitter::internal_call_preamble() const { + // gprs + Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15, + h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp}; + size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); + + h->sub(h->rsp, n_gprs_to_save * gpr_size); + for (size_t i = 0; i < n_gprs_to_save; ++i) + h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]); + + // mask regs + // need preserve based on cpu capability, instead of host isa. + // in case there are possibilty that different isa emitters exist in one subgraph KernelEmitter from perf standpoint in the future. + // e.g. other emitters isa is avx512, while this emitter isa is avx2, and internal call is used. Internal call may use avx512 and spoil k-reg. + // do not care about platform w/ avx512_common but w/o avx512_core(knight landing), which is obsoleted. + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) { + h->sub(h->rsp, k_mask_num * k_mask_size); + for (size_t i = 0; i < k_mask_num; ++i) { + h->kmovq(h->ptr[h->rsp + i * k_mask_size], Xbyak::Opmask(static_cast(i))); + } + } + + // vector regs + // 1. Caller obligation to save vector registers as callee may use them. + // 2. There is an implicit assumption that the host code uses the same + // `isa` as the injector. 
+
+void jit_emitter::internal_call_postamble() const {
+    // restore vector registers
+    for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
+        pop_vec(static_cast<size_t>(i), h->ptr[h->rsp + i * get_vec_length()]);
+    }
+    h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());
+
+    // restore k registers
+    if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
+        for (int i = k_mask_num - 1; i >= 0; --i) {
+            h->kmovq(Xbyak::Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
+        }
+        h->add(h->rsp, k_mask_num * k_mask_size);
+    }
+
+    // restore gpr registers
+    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
+                                     h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
+    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
+    for (int i = n_gprs_to_save - 1; i >= 0; --i)
+        h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
+    h->add(h->rsp, n_gprs_to_save * gpr_size);
+}
+
+void jit_emitter::internal_call_rsp_align() const {
+    h->mov(h->rbx, h->rsp);
+    h->and_(h->rbx, 0xf);
+    h->sub(h->rsp, h->rbx);
+}
+
+void jit_emitter::internal_call_rsp_restore() const {
+    h->add(h->rsp, h->rbx);
+}
+
 } // namespace intel_cpu
 } // namespace ov
diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp
index c10adb19f9e..66f681265b9 100644
--- a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp
+++ b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp
@@ -106,6 +106,8 @@ protected:
     mutable std::vector<size_t> aux_gpr_idxs;
 
     static constexpr int k_mask_size = 8;
+    static constexpr int k_mask_num = 8;
+    static constexpr int gpr_size = 8;
 
     Xbyak::Address table_val(std::string key, size_t key_off_val_shift = 0) const {
         auto off = table_off(key, key_off_val_shift);
@@ -130,6 +132,13 @@ protected:
         }
     }
 
+    void internal_call_preamble() const;
+    void internal_call_postamble() const;
+    // Align the stack on a 16-byte boundary as the ABI requires.
+    // rbx is callee-saved, so the callee must preserve it: rbx still holds the alignment offset after the call returns.
+    void internal_call_rsp_align() const;
+    void internal_call_rsp_restore() const;
+
 private:
     mutable std::vector<size_t> preserved_vec_idxs;
     mutable std::vector<size_t> preserved_gpr_idxs;
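The internal_call_rsp_align()/internal_call_rsp_restore() pair works because rbx is callee-saved in both the System V and Windows x64 ABIs: the alignment offset parked in rbx survives the call. In scalar form (a sketch; the function name is hypothetical):

#include <cstdint>

// What the emitted instructions compute: rbx = rsp & 0xf; rsp -= rbx;
// leaves rsp 16-byte aligned and keeps the undo offset in a callee-saved register.
uint64_t align_rsp_down_to_16(uint64_t rsp, uint64_t& rbx) {
    rbx = rsp & 0xf;   // h->mov(h->rbx, h->rsp); h->and_(h->rbx, 0xf);
    return rsp - rbx;  // h->sub(h->rsp, h->rbx); restore is h->add(h->rsp, h->rbx)
}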
diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_chrono_emitters.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_chrono_emitters.cpp
new file mode 100644
index 00000000000..a94535dfbcb
--- /dev/null
+++ b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_chrono_emitters.cpp
@@ -0,0 +1,73 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "jit_emitter.hpp"
+#include "jit_perf_count_chrono_emitters.hpp"
+#include <cpu/x64/jit_generator.hpp>
+
+using namespace dnnl::impl;
+using namespace dnnl::impl::utils;
+using namespace dnnl::impl::cpu;
+using namespace dnnl::impl::cpu::x64;
+using namespace Xbyak;
+using namespace Xbyak::util;
+
+namespace ov {
+namespace intel_cpu {
+
+jit_perf_count_chrono_start_emitter::jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
+                                                                         const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
+    m_start_node = ov::as_type_ptr<snippets::op::PerfCountBegin>(n);
+}
+
+size_t jit_perf_count_chrono_start_emitter::get_inputs_num() const {
+    return 0;
+}
+
+void jit_perf_count_chrono_start_emitter::set_start_time(snippets::op::PerfCountBegin* start_node) {
+    start_node->set_start_time();
+}
+
+void jit_perf_count_chrono_start_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
+    internal_call_preamble();
+
+    const auto &set_start_time_overload = static_cast<void (*)(snippets::op::PerfCountBegin*)>(set_start_time);
+    h->mov(h->rax, reinterpret_cast<size_t>(set_start_time_overload));
+    h->mov(abi_param1, reinterpret_cast<size_t>(m_start_node.get()));
+    internal_call_rsp_align();
+    h->call(h->rax);
+    internal_call_rsp_restore();
+
+    internal_call_postamble();
+}
+
+///////////////////jit_perf_count_chrono_end_emitter////////////////////////////////////
+jit_perf_count_chrono_end_emitter::jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
+                                                                     const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
+    m_end_node = ov::as_type_ptr<snippets::op::PerfCountEnd>(n);
+}
+
+size_t jit_perf_count_chrono_end_emitter::get_inputs_num() const {
+    return 0;
+}
+
+void jit_perf_count_chrono_end_emitter::set_accumulated_time(snippets::op::PerfCountEnd* end_node) {
+    end_node->set_accumulated_time();
+}
+
+void jit_perf_count_chrono_end_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
+    internal_call_preamble();
+
+    const auto &set_accumulated_time_overload = static_cast<void (*)(snippets::op::PerfCountEnd*)>(set_accumulated_time);
+    h->mov(h->rax, reinterpret_cast<size_t>(set_accumulated_time_overload));
+    h->mov(abi_param1, reinterpret_cast<size_t>(m_end_node.get()));
+    internal_call_rsp_align();
+    h->call(h->rax);
+    internal_call_rsp_restore();
+
+    internal_call_postamble();
+}
+
+} // namespace intel_cpu
+} // namespace ov
diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_chrono_emitters.hpp b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_chrono_emitters.hpp
new file mode 100644
index 00000000000..763ac995ffe
--- /dev/null
+++ b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_chrono_emitters.hpp
@@ -0,0 +1,40 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "jit_emitter.hpp"
+#include <cpu/x64/jit_generator.hpp>
+
+#include "snippets/op/perf_count.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+class jit_perf_count_chrono_start_emitter : public jit_emitter {
+public:
+    jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
+                                        const std::shared_ptr<ov::Node>& n);
+    size_t get_inputs_num() const override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
+    static void set_start_time(snippets::op::PerfCountBegin* start_node);
+    std::shared_ptr<snippets::op::PerfCountBegin> m_start_node = nullptr;
+};
+
+class jit_perf_count_chrono_end_emitter : public jit_emitter {
+public:
+    jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
+                                      const std::shared_ptr<ov::Node>& n);
+    size_t get_inputs_num() const override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
+    static void set_accumulated_time(snippets::op::PerfCountEnd* end_node);
+    std::shared_ptr<snippets::op::PerfCountEnd> m_end_node = nullptr;
+};
+
+} // namespace intel_cpu
+} // namespace ov
diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.cpp
new file mode 100644
index 00000000000..7f1ccda3aca
--- /dev/null
+++ b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.cpp
@@ -0,0 +1,86 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "jit_emitter.hpp"
+#include "jit_perf_count_rdtsc_emitters.hpp"
+#include <cpu/x64/jit_generator.hpp>
+
+using namespace dnnl::impl;
+using namespace dnnl::impl::utils;
+using namespace dnnl::impl::cpu;
+using namespace dnnl::impl::cpu::x64;
+using namespace Xbyak;
+using namespace Xbyak::util;
+
+namespace ov {
+namespace intel_cpu {
+
+jit_perf_count_rdtsc_start_emitter::jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
+                                                                       const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
+    m_start_node = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscBegin>(n);
+}
+
+size_t jit_perf_count_rdtsc_start_emitter::get_inputs_num() const {
+    return 0;
+}
+
+void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
+    h->push(h->rax);
+    h->push(h->rdx);
+
+    // RDTSC loads the high-order 32 bits of the time-stamp counter into EDX and the low-order 32 bits into EAX.
+    h->lfence();
+    h->rdtsc();
+    h->lfence();
+    h->shl(h->rdx, 0x20);    // shift the high half into the upper 32 bits (0x20 == 32) of rdx
+    h->or_(h->rdx, h->rax);  // rdx now holds the current 64-bit tsc
+
+    h->mov(h->rax, reinterpret_cast<size_t>(&m_start_node->start_count));
+    h->mov(qword[h->rax], h->rdx);
+
+    h->pop(h->rdx);
+    h->pop(h->rax);
+}
+
+///////////////////jit_perf_count_rdtsc_end_emitter////////////////////////////////////
+jit_perf_count_rdtsc_end_emitter::jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
+                                                                   const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
+    m_end_node = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscEnd>(n);
+}
+
+size_t jit_perf_count_rdtsc_end_emitter::get_inputs_num() const {
+    return 0;
+}
+
+void jit_perf_count_rdtsc_end_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
+    h->push(h->rax);
+    h->push(h->rdx);
+
+    h->lfence();
+    h->rdtsc();
+    h->lfence();
+    h->shl(h->rdx, 0x20);
+    h->or_(h->rdx, h->rax);  // rdx holds the current tsc
+
+    // tsc duration
+    h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->get_pc_begin()->start_count));
+    h->sub(h->rdx, qword[h->rax]);  // rdx holds the tsc duration
+
+    // accumulation += tsc duration
+    h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->accumulation));
+    h->add(h->rdx, qword[h->rax]);
+    h->mov(qword[h->rax], h->rdx);
+
+    // iteration++
+    h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->iteration));
+    h->mov(h->rdx, qword[h->rax]);
+    h->add(h->rdx, 0x01);
+    h->mov(qword[h->rax], h->rdx);
+
+    h->pop(h->rdx);
+    h->pop(h->rax);
+}
+
+} // namespace intel_cpu
+} // namespace ov
diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.hpp b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.hpp
new file mode 100644
index 00000000000..c6314adc72a
--- /dev/null
+++ b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.hpp
@@ -0,0 +1,37 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "jit_emitter.hpp"
+#include <cpu/x64/jit_generator.hpp>
+#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+class jit_perf_count_rdtsc_start_emitter : public jit_emitter {
+public:
+    jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
+                                       const std::shared_ptr<ov::Node>& n);
+    size_t get_inputs_num() const override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
+    std::shared_ptr<ov::intel_cpu::PerfCountRdtscBegin> m_start_node = nullptr;
+};
+
+class jit_perf_count_rdtsc_end_emitter : public jit_emitter {
+public:
+    jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
+                                     const std::shared_ptr<ov::Node>& n);
+    size_t get_inputs_num() const override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
+    std::shared_ptr<ov::intel_cpu::PerfCountRdtscEnd> m_end_node = nullptr;
+};
+
+} // namespace intel_cpu
+} // namespace ov
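The emitted lfence/rdtsc/lfence sequence and the EDX:EAX merge correspond to this plain C++ reading of the TSC (a sketch for GCC/Clang on x86-64):

#include <cstdint>
#include <x86intrin.h>  // __rdtsc, _mm_lfence

inline uint64_t serialized_rdtsc() {
    _mm_lfence();                    // keep earlier instructions from reordering past the read
    const uint64_t tsc = __rdtsc();  // returns (EDX << 32) | EAX, i.e. the shl/or_ done in the JIT code
    _mm_lfence();                    // keep the read from reordering past later instructions
    return tsc;
}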
diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp
index 40e49a7b158..b87b265a03f 100644
--- a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp
@@ -26,10 +26,6 @@
 using jit_generator = dnnl::impl::cpu::x64::jit_generator;
 using cpu_isa_t = dnnl::impl::cpu::x64::cpu_isa_t;
 using ExpressionPtr = ov::snippets::lowered::ExpressionPtr;
 
-namespace {
-constexpr size_t gpr_size = 8;
-}  // namespace
-
 inline static void transform_idxs_to_regs(const std::vector<size_t>& idxs, std::vector<Reg64>& regs) {
     regs.resize(idxs.size());
     std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx){return Reg64(static_cast<int>(idx));});
@@ -1114,32 +1110,7 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, c
         h->add(h->rsp, n_gprs_to_save * gpr_size);
     }
 
-    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
-                                     h->rax, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx};
-    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
-
-    h->sub(h->rsp, n_gprs_to_save * gpr_size);
-    for (size_t i = 0; i < n_gprs_to_save; ++i)
-        h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);
-
-    // caller obligation to save k-regs as callee may use them
-    size_t n_k_regs_to_save = 8;
-    h->sub(h->rsp, n_k_regs_to_save * k_mask_size);
-    for (size_t i = 0; i < n_k_regs_to_save; ++i) {
-        if (mayiuse(avx512_core))
-            h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
-        else
-            h->kmovw(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
-    }
-
-    // 1. Caller obligation to save vector registers as callee may use them.
-    // 2. There is an implicit assumption that the host code uses the same
-    //    `isa` as the injector. Once the assumption is wrong, `vecs_count` and
-    //    `vlen` should be replaced with `host_isa::vlen` and
-    //    `host_isa::vecs_count`.
-    h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
-    for (size_t i = 0; i < get_max_vecs_count(); ++i)
-        h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Zmm(i));
+    internal_call_preamble();
 
     // save function address in gpr to pass in call instruction
     const auto& brgemm_kernel_overload = static_castmov(abi_param6, static_cast(m_with_comp));
 #endif
-    // align stack on 16-byte as ABI requires
-    // note that RBX must not be changed by the callee
-    h->mov(h->rbx, h->rsp);
-    h->and_(h->rbx, 0xf);
-    h->sub(h->rsp, h->rbx);
-
+    internal_call_rsp_align();
     h->call(h->rbp);
-
-    h->add(h->rsp, h->rbx);
+    internal_call_rsp_restore();
 
 #ifdef _WIN32
     h->add(h->rsp, num_args_passed_on_stack * gpr_size);
 #endif
 
-    // restore vector registers
-    for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
-        h->uni_vmovups(Zmm(i), h->ptr[h->rsp + i * get_vec_length()]);
-    }
-    h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());
-
-    // restore k registers
-    for (int i = n_k_regs_to_save - 1; i >= 0; --i) {
-        if (mayiuse(avx512_core))
-            h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
-        else
-            h->kmovw(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
-    }
-    h->add(h->rsp, n_k_regs_to_save * k_mask_size);
-
-    // restore gpr registers
-    for (int i = n_gprs_to_save - 1; i >= 0; --i)
-        h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
-    h->add(h->rsp, n_gprs_to_save * gpr_size);
+    internal_call_postamble();
 }
 
 void BrgemmEmitter::kernel_execute(const brgemm_kernel_t *brg_kernel,
@@ -1358,32 +1306,7 @@ void BrgemmCopyBEmitter::emit_impl(const std::vector<size_t>& in,
 void BrgemmCopyBEmitter::emit_kernel_call(const matmul::jit_brgemm_matmul_copy_b_t* kernel, Reg64 src, Reg64 dst, Reg64 comp,
                                           size_t N, size_t K, size_t offset_in, size_t offset_out, size_t offset_comp) const {
-    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
-                                     h->rax, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx};
-    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
-
-    h->sub(h->rsp, n_gprs_to_save * gpr_size);
-    for (size_t i = 0; i < n_gprs_to_save; ++i)
-        h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);
-
-    // caller obligation to save k-regs as callee may use them
-    size_t n_k_regs_to_save = 8;
-    h->sub(h->rsp, n_k_regs_to_save * k_mask_size);
-    for (size_t i = 0; i < n_k_regs_to_save; ++i) {
-        if (mayiuse(avx512_core))
-            h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
-        else
-            h->kmovw(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
-    }
-
-    // 1. Caller obligation to save vector registers as callee may use them.
-    // 2. There is an implicit assumption that the host code uses the same
-    //    `isa` as the injector. Once the assumption is wrong, `vecs_count` and
-    //    `vlen` should be replaced with `host_isa::vlen` and
-    //    `host_isa::vecs_count`.
-    h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
-    for (size_t i = 0; i < get_max_vecs_count(); ++i)
-        h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Zmm(i));
+    internal_call_preamble();
 
     const auto data_ptr = [&](Xmm xmm, Xbyak::Reg64 reg, size_t bytes_offset) {
         h->uni_vmovq(reg, xmm);
@@ -1437,38 +1360,16 @@ void BrgemmCopyBEmitter::emit_kernel_call(const matmul::jit_brgemm_matmul_copy_b
     h->mov(abi_param5, N);
     h->mov(abi_param6, K);
 #endif
-    // align stack on 16-byte as ABI requires
-    // note that RBX must not be changed by the callee
-    h->mov(h->rbx, h->rsp);
-    h->and_(h->rbx, 0xf);
-    h->sub(h->rsp, h->rbx);
+    internal_call_rsp_align();
     h->call(h->rbp);
-
-    h->add(h->rsp, h->rbx);
+    internal_call_rsp_restore();
 
 #ifdef _WIN32
     h->add(h->rsp, gpr_size * num_args_passed_on_stack);
 #endif
-    // restore vector registers
-    for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
-        h->uni_vmovups(Zmm(i), h->ptr[h->rsp + i * get_vec_length()]);
-    }
-    h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());
-
-    // restore k registers
-    for (int i = n_k_regs_to_save - 1; i >= 0; --i) {
-        if (mayiuse(avx512_core))
-            h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
-        else
-            h->kmovw(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
-    }
-    h->add(h->rsp, n_k_regs_to_save * k_mask_size);
-
-    // restore gpr registers
-    for (int i = n_gprs_to_save - 1; i >= 0; --i)
-        h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
-    h->add(h->rsp, n_gprs_to_save * gpr_size);
+    internal_call_postamble();
 }
 
 void BrgemmCopyBEmitter::execute(matmul::jit_brgemm_matmul_copy_b_t *kernel, const void *src,
diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp
index 8110a4c2fff..d9fe51c5151 100644
--- a/src/plugins/intel_cpu/src/extension.cpp
+++ b/src/plugins/intel_cpu/src/extension.cpp
@@ -15,6 +15,7 @@
 #include "transformations/snippets/x64/op/store_convert.hpp"
 #include "transformations/snippets/x64/op/brgemm_cpu.hpp"
 #include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
+#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp"
 
 #include
 #include
 
@@ -159,12 +160,16 @@ std::map<std::string, ngraph::OpSet> Extension::getOpSets() {
     NGRAPH_OP(Subgraph, ov::snippets::op)
     NGRAPH_OP(VectorBuffer, ov::snippets::op)
     NGRAPH_OP(RankNormalization, ov::snippets::op)
+    NGRAPH_OP(PerfCountBegin, ov::snippets::op)
+    NGRAPH_OP(PerfCountEnd, ov::snippets::op)
     NGRAPH_OP_X64(LoadConvertSaturation, ov::intel_cpu)
     NGRAPH_OP_X64(LoadConvertTruncation, ov::intel_cpu)
     NGRAPH_OP_X64(StoreConvertSaturation, ov::intel_cpu)
    NGRAPH_OP_X64(StoreConvertTruncation, ov::intel_cpu)
     NGRAPH_OP_X64(BrgemmCPU, ov::intel_cpu)
     NGRAPH_OP_X64(BrgemmCopyB, ov::intel_cpu)
+    NGRAPH_OP_X64(PerfCountRdtscBegin, ov::intel_cpu)
+    NGRAPH_OP_X64(PerfCountRdtscEnd, ov::intel_cpu)
 #undef NGRAPH_OP
 
     return opset;
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/perf_count_rdtsc.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/perf_count_rdtsc.cpp
new file mode 100644
index 00000000000..a3343d5ab74
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/perf_count_rdtsc.cpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "perf_count_rdtsc.hpp"
+
+using namespace ov;
+using namespace ov::intel_cpu;
+
+/////////////////////////PerfCountRdtscBegin//////////////////////
+PerfCountRdtscBegin::PerfCountRdtscBegin() : PerfCountBeginBase() {
+    validate_and_infer_types_except_PerfCountEnd();
+}
+
+std::shared_ptr<Node> PerfCountRdtscBegin::clone_with_new_inputs(const OutputVector& inputs) const {
+    return std::make_shared<PerfCountRdtscBegin>();
+}
+
+/////////////////////////PerfCountRdtscEnd//////////////////////
+PerfCountRdtscEnd::PerfCountRdtscEnd(const Output<Node>& pc_begin) : ov::snippets::op::PerfCountEndBase({pc_begin}), accumulation(0ul), iteration(0u) {
+    constructor_validate_and_infer_types();
+}
+
+std::shared_ptr<Node> PerfCountRdtscEnd::clone_with_new_inputs(const OutputVector& inputs) const {
+    return std::make_shared<PerfCountRdtscEnd>(inputs.at(0));
+}
+
+std::shared_ptr<PerfCountRdtscBegin> PerfCountRdtscEnd::get_pc_begin() {
+    const auto& pc_begin = ov::as_type_ptr<PerfCountRdtscBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
+    OPENVINO_ASSERT(pc_begin != nullptr, "PerfCountRdtscEnd last input is not connected to PerfCountRdtscBegin");
+    return pc_begin;
+}
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/perf_count_rdtsc.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/perf_count_rdtsc.hpp
new file mode 100644
index 00000000000..91f8b82fed5
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/perf_count_rdtsc.hpp
@@ -0,0 +1,55 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/op/op.hpp"
+#include "snippets/op/perf_count.hpp"
+
+using namespace ov::snippets::op;
+
+namespace ov {
+namespace intel_cpu {
+
+/**
+ * @interface PerfCountRdtscBegin
+ * @brief Records the start count by reading the RDTSC register
+ * @ingroup snippets
+ */
+class PerfCountRdtscBegin : public PerfCountBeginBase {
+public:
+    OPENVINO_OP("PerfCountRdtscBegin", "SnippetsOpset", PerfCountBeginBase);
+    PerfCountRdtscBegin();
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
+
+    uint64_t start_count = 0ul;
+};
+
+/**
+ * @interface PerfCountRdtscEnd
+ * @brief Records the end count and accumulates the duration
+ * @ingroup snippets
+ */
+class PerfCountRdtscEnd : public PerfCountEndBase {
+public:
+    OPENVINO_OP("PerfCountRdtscEnd", "SnippetsOpset", PerfCountEndBase);
+    PerfCountRdtscEnd(const Output<Node>& pc_begin);
+    PerfCountRdtscEnd() = default;
+    ~PerfCountRdtscEnd() {
+        uint64_t avg = iteration == 0 ? 0 : accumulation / iteration;
+        std::cout << "accumulation:" << accumulation << " iteration:" << iteration << " avg:" << avg << std::endl;
+    }
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
+
+    std::shared_ptr<PerfCountRdtscBegin> get_pc_begin();
+    // On each call, PerfCountRdtscBegin stores the current TSC value into start_count.
+    // On each call, PerfCountRdtscEnd reads the TSC again, then accumulation += end_count - start_count and iteration++.
+    // The destructor of PerfCountRdtscEnd outputs the perf info.
+    // accumulation is a cycle count, not wall time.
+    uint64_t accumulation = 0ul;
+    uint32_t iteration = 0u;
+};
+
+} // namespace intel_cpu
+} // namespace ov
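As the comments above note, the RDTSC counters accumulate TSC reference cycles, not wall time; converting to nanoseconds needs the invariant-TSC frequency, which this patch does not query. A hedged conversion sketch, with the frequency assumed to come from elsewhere (e.g. calibration against a monotonic clock):

#include <cstdint>

// tsc_hz is an assumption here: it must be measured externally, e.g. by calibrating RDTSC
// against std::chrono::steady_clock over a known interval.
double rdtsc_cycles_to_ns(uint64_t cycles, double tsc_hz) {
    return static_cast<double>(cycles) / tsc_hz * 1e9;
}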
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/shape_inference.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/shape_inference.cpp
index 6bb833262a5..a9674cc00da 100644
--- a/src/plugins/intel_cpu/src/transformations/snippets/x64/shape_inference.cpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/shape_inference.cpp
@@ -9,6 +9,7 @@
 #include "op/fused_mul_add.hpp"
 #include "op/load_convert.hpp"
 #include "op/store_convert.hpp"
+#include "op/perf_count_rdtsc.hpp"
 #include "transformations/cpu_opset/common/op/swish_cpu.hpp"
 
 namespace ov {
@@ -38,6 +39,8 @@ const CPUShapeInferSnippetsFactory::TRegistry CPUShapeInferSnippetsFactory::spec
     SHAPE_INFER_PREDEFINED(ov::intel_cpu::LoadConvertTruncation, PassThroughShapeInfer),
     SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertSaturation, PassThroughShapeInfer),
     SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertTruncation, PassThroughShapeInfer),
+    SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscBegin, EmptyShapeInfer),
+    SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscEnd, EmptyShapeInfer),
     SHAPE_INFER_OP_SPECIFIC_EXTERNAL(ov::intel_cpu::BrgemmCPU, BrgemmShapeInfer),
     //  SHAPE_INFER_OP_SPECIFIC(ov::intel_cpu::BrgemmCopyB),