[SnippetS] Perf count nodes and emitters (#19493)

Chenhu Wang 2023-12-05 19:49:27 +08:00 committed by GitHub
parent 791762fb19
commit f80793e420
25 changed files with 781 additions and 113 deletions

View File

@ -29,7 +29,7 @@ class Generator;
class LoweringResult {
friend class Generator;
// Some emitters rely on other precompiled kernels.
// We need to keep the pointers to such emitters alive, so the kernels or nodes would still be accessible at runtime.
std::vector<std::shared_ptr<Emitter>> m_saved_emitters{};
public:

View File

@ -14,6 +14,18 @@ namespace ov {
namespace snippets {
namespace lowered {
// Snippets performance count mode
// Disabled - default, no perf count for snippets
// Chrono - perf count with chrono calls. This is a universal method that also supports the multi-threaded case, outputting perf count data for each thread.
// BackendSpecific - perf count provided by the backend, for device-specific requirements.
// For example, for the sake of lower overhead and more accurate results, an x86 CPU specific mode that reads the RDTSC register is implemented:
// a pair of perf count start and end executions takes ~50ns, while the Chrono mode takes ~260ns on ICX. This mode supports a single thread only.
enum PerfCountMode {
Disabled,
Chrono,
BackendSpecific,
};
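For orientation, a minimal sketch of selecting the Chrono mode on the lowering Config (illustrative only; the commit itself leaves the default at Disabled, and the field is the perf_count_mode member declared below):

ov::snippets::lowered::Config lowering_config;
// default is PerfCountMode::Disabled; Chrono enables the chrono-based counters
lowering_config.perf_count_mode = ov::snippets::lowered::PerfCountMode::Chrono;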
class Config {
public:
// True if the lowered Emitters need to be accessed during runtime. Normally they're destroyed after code emission.
@ -21,6 +33,7 @@ public:
// True if we should check runtime info for nodes to call specific needed transformations
bool m_need_fill_tail_register = false;
size_t m_loop_depth = 1;
PerfCountMode perf_count_mode = PerfCountMode::Disabled;
// Some Subgraphs don't support domain optimization due to operations' semantics
bool m_enable_domain_optimization = false;
// Minimal advised work amount for parallel execution.

View File

@ -0,0 +1,33 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "pass.hpp"
#include "snippets/op/perf_count.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
/**
* @interface InsertPerfCount
 * @brief Insert a PerfCountBegin node after the last parameter and a PerfCountEnd node before the first result.
 * This is an illustrative transformation to enable perf count in snippets.
 * Developers can modify it to insert perf count pairs around any sequence of nodes of interest.
* @ingroup snippets
*/
class InsertPerfCount: public Pass {
public:
OPENVINO_RTTI("InsertPerfCount", "Pass")
InsertPerfCount() = default;
bool run(LinearIR& linear_ir) override;
};
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov

View File

@ -0,0 +1,93 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "openvino/op/op.hpp"
#include "openvino/runtime/threading/thread_local.hpp"
namespace ov {
namespace snippets {
namespace op {
/**
* @interface PerfCountBeginBase
* @brief Base class for PerfCountBegin and PerfCountRdtscBegin(cpu)
* @ingroup snippets
*/
class PerfCountBeginBase : public ov::op::Op {
public:
OPENVINO_OP("PerfCountBeginBase", "SnippetsOpset");
PerfCountBeginBase(const std::vector<Output<Node>>& args);
PerfCountBeginBase() = default;
void validate_and_infer_types() override;
bool visit_attributes(AttributeVisitor& visitor) override;
protected:
void validate_and_infer_types_except_PerfCountEnd();
};
/**
* @interface PerfCountEndBase
* @brief Base class for PerfCountEnd and PerfCountRdtscEnd
* @ingroup snippets
*/
class PerfCountEndBase : public ov::op::Op {
public:
OPENVINO_OP("PerfCountEndBase", "SnippetsOpset");
PerfCountEndBase(const std::vector<Output<Node>>& args);
PerfCountEndBase() = default;
void validate_and_infer_types() override;
bool visit_attributes(AttributeVisitor& visitor) override;
};
/**
* @interface PerfCountBegin
* @brief Performance count start time with chrono call
* @ingroup snippets
*/
class PerfCountBegin : public PerfCountBeginBase {
public:
OPENVINO_OP("PerfCountBegin", "SnippetsOpset", PerfCountBeginBase);
PerfCountBegin();
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
void set_start_time();
std::chrono::high_resolution_clock::time_point& get_start_time();
private:
ov::threading::ThreadLocal<std::chrono::high_resolution_clock::time_point> start_time_stamp;
};
/**
* @interface PerfCountEnd
* @brief Performance count end time and duration with chrono call
* @ingroup snippets
*/
class PerfCountEnd : public PerfCountEndBase {
public:
OPENVINO_OP("PerfCountEnd", "SnippetsOpset", PerfCountEndBase);
PerfCountEnd(const Output<Node>& pc_begin);
PerfCountEnd() = default;
~PerfCountEnd() {
output_perf_count();
}
void output_perf_count();
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
void init_pc_begin();
void set_accumulated_time();
private:
ov::threading::ThreadLocal<uint64_t> accumulation;
ov::threading::ThreadLocal<uint32_t> iteration;
std::shared_ptr<PerfCountBegin> m_pc_begin = nullptr;
};
} // namespace op
} // namespace snippets
} // namespace ov
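For reference, a minimal sketch of what a Begin/End pair accumulates per thread, written as plain C++ with hypothetical local names (the real nodes keep these values in the ThreadLocal members declared above):

// PerfCountBegin::set_start_time()
auto start = std::chrono::high_resolution_clock::now();
// ... measured region (the JIT-ed kernel body) ...
// PerfCountEnd::set_accumulated_time()
auto end = std::chrono::high_resolution_clock::now();
accumulation += std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
iteration++;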

View File

@ -25,6 +25,7 @@
#include "op/brgemm.hpp" #include "op/brgemm.hpp"
#include "op/vector_buffer.hpp" #include "op/vector_buffer.hpp"
#include "op/rank_normalization.hpp" #include "op/rank_normalization.hpp"
#include "op/perf_count.hpp"
namespace ov { namespace ov {
namespace snippets { namespace snippets {

View File

@ -24,6 +24,9 @@ OV_OP(Scalar, ov::snippets::op)
OV_OP(Nop, ov::snippets::op)
OV_OP(RankNormalization, ov::snippets::op)
OV_OP(PerfCountBegin, ov::snippets::op)
OV_OP(PerfCountEnd, ov::snippets::op)
// Layout-oblivious from opset1
// opset completeness

View File

@ -44,7 +44,8 @@ void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, c
}
OV_ITT_TASK_NEXT(GENERATE, "::GetSnippet")
// 1. Some emitters use precompiled kernels. They need to be saved, so the kernels are accessible at runtime.
// 2. Perf count nodes, kept as fields of emitters, should stay alive at runtime.
if (linear_ir.get_config().m_save_expressions) {
for (const auto& expr : linear_ir) {
const auto& emitter = expr->get_emitter();
@ -66,7 +67,9 @@ Generator::opRegType Generator::get_op_reg_type(const std::shared_ptr<Node>& op)
std::dynamic_pointer_cast<op::LoopEnd>(op) ||
std::dynamic_pointer_cast<op::Brgemm>(op) ||
std::dynamic_pointer_cast<op::Buffer>(op) ||
std::dynamic_pointer_cast<op::RankNormalization>(op) ||
std::dynamic_pointer_cast<op::PerfCountBeginBase>(op) ||
std::dynamic_pointer_cast<op::PerfCountEndBase>(op))
return gpr2gpr;
else if (std::dynamic_pointer_cast<snippets::op::Load>(op) ||
std::dynamic_pointer_cast<snippets::op::BroadcastLoad>(op))

View File

@ -0,0 +1,62 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/lowered/pass/insert_perf_count.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/itt.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
bool InsertPerfCount::run(LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertPerfCount")
if (linear_ir.empty())
return false;
auto is_parameter = [](const std::shared_ptr<ov::Node>& node) {
return ov::is_type<ov::op::v0::Parameter>(node);
};
auto is_result = [](const std::shared_ptr<ov::Node>& node) {
return ov::is_type<ov::op::v0::Result>(node);
};
// mark perf_count_begin and perf_count_end position
auto perf_count_begin_pos = linear_ir.cbegin();
auto perf_count_end_pos = perf_count_begin_pos;
bool first_result_marked = false;
for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) {
const auto expr = *expr_it;
const auto& node = expr->get_node();
if (is_parameter(node))
perf_count_begin_pos = expr_it;
if (is_result(node) && !first_result_marked) {
perf_count_end_pos = expr_it;
first_result_marked = true;
}
}
// insert perf_count_begin after the last parameter
// linear_ir.insert inserts before the given iterator, so move to the next position.
perf_count_begin_pos = std::next(perf_count_begin_pos);
const auto& perf_count_begin = std::make_shared<op::PerfCountBegin>();
const auto& perf_count_begin_expr = linear_ir.create_expression(perf_count_begin, std::vector<PortConnectorPtr>{});
linear_ir.insert(perf_count_begin_pos, perf_count_begin_expr);
// insert perf_count_end before first result
const auto& perf_count_end = std::make_shared<op::PerfCountEnd>(perf_count_begin->output(0));
perf_count_end->set_friendly_name("last_parameter_to_first_result");
const auto& perf_count_end_expr = linear_ir.create_expression(perf_count_end, std::vector<PortConnectorPtr>{});
linear_ir.insert(perf_count_end_pos, perf_count_end_expr);
return true;
}
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov

View File

@ -0,0 +1,115 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/op/perf_count.hpp"
namespace ov {
namespace snippets {
namespace op {
/////////////////PerfCountBeginBase/////////////////
PerfCountBeginBase::PerfCountBeginBase(const std::vector<Output<Node>>& args) : Op() {}
void PerfCountBeginBase::validate_and_infer_types() {
validate_and_infer_types_except_PerfCountEnd();
OPENVINO_ASSERT(get_output_size() == 1, "PerfCountBegin must have only one output");
const auto& last_output_inputs = get_output_target_inputs(0);
OPENVINO_ASSERT(last_output_inputs.size() == 1, "PerfCountBegin must have exactly one input attached to the last output");
const auto& pc_end = ov::as_type_ptr<PerfCountEndBase>(last_output_inputs.begin()->get_node()->shared_from_this());
OPENVINO_ASSERT(pc_end != nullptr, "PerfCountBegin must have PerfCountEnd connected to its last output");
}
bool PerfCountBeginBase::visit_attributes(AttributeVisitor &visitor) {
return true;
}
void PerfCountBeginBase::validate_and_infer_types_except_PerfCountEnd() {
NODE_VALIDATION_CHECK(this, get_input_size() == 0, "PerfCountBegin doesn't expect any inputs");
set_output_type(0, element::f32, {});
}
//////////////////PerfCountEndBase/////////////////
PerfCountEndBase::PerfCountEndBase(const std::vector<Output<Node>> &args) : Op(args) {}
void PerfCountEndBase::validate_and_infer_types() {
NODE_VALIDATION_CHECK(this, get_input_size() == 1, "PerfCountEndBase must have one input");
const auto& pc_begin = ov::as_type_ptr<PerfCountBeginBase>(get_input_node_shared_ptr(0));
NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEndBase must have PerfCountBeginBase as the last argument");
set_output_type(0, element::f32, {});
}
bool PerfCountEndBase::visit_attributes(AttributeVisitor &visitor) {
return true;
}
/////////////////PerfCountBegin/////////////////
PerfCountBegin::PerfCountBegin() : PerfCountBeginBase() {
validate_and_infer_types_except_PerfCountEnd();
}
std::shared_ptr<Node> PerfCountBegin::clone_with_new_inputs(const OutputVector& inputs) const {
return std::make_shared<PerfCountBegin>();
}
std::chrono::high_resolution_clock::time_point& PerfCountBegin::get_start_time() {
return start_time_stamp.local();
}
void PerfCountBegin::set_start_time() {
start_time_stamp.local() = std::chrono::high_resolution_clock::now();
}
//////////////////PerfCountEnd///////////////
PerfCountEnd::PerfCountEnd(const Output<Node>& pc_begin) : PerfCountEndBase({pc_begin}), accumulation(0ul), iteration(0u) {
constructor_validate_and_infer_types();
init_pc_begin();
}
std::shared_ptr<Node> PerfCountEnd::clone_with_new_inputs(const OutputVector& inputs) const {
return std::make_shared<PerfCountEnd>(inputs.at(0));
}
void PerfCountEnd::set_accumulated_time() {
auto current_time = std::chrono::high_resolution_clock::now();
auto& start_time = m_pc_begin->get_start_time();
accumulation.local() += std::chrono::duration_cast<std::chrono::nanoseconds>(current_time - start_time).count();
iteration.local()++;
}
void PerfCountEnd::init_pc_begin() {
m_pc_begin = ov::as_type_ptr<PerfCountBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
NODE_VALIDATION_CHECK(this, m_pc_begin != nullptr, "PerfCountEnd last input is not connected to PerfCountBegin");
}
void PerfCountEnd::output_perf_count() {
OPENVINO_ASSERT(accumulation.size() == iteration.size(), "accumulation size should be the same as iteration size in perf_count_end node.");
auto iterator_iter = iteration.begin();
auto iterator_acc = accumulation.begin();
int t_num = 0;
uint64_t avg_max = 0;
std::cout << "Perf count data in perfCountEnd node with name " << get_friendly_name() << " is:"<< std::endl;
for (; iterator_iter != iteration.end(); ++iterator_iter, ++iterator_acc) {
const auto iter = *iterator_iter;
const auto acc = *iterator_acc;
uint64_t avg = iter == 0 ? 0 : acc / iter;
if (avg > avg_max)
avg_max = avg;
std::cout << "accumulated time:" << acc << "ns, iteration:" << iter << " avg time:" << avg << "ns"<< " on thread:" << t_num << std::endl;
t_num++;
}
// max time across all threads: combine with a reduce-max
auto BinaryFunc = [](const uint64_t& a, const uint64_t& b) {
return a >= b ? a : b;
};
// max accumulation
uint64_t acc_max = accumulation.combine(BinaryFunc);
std::cout << "max accumulated time:" << acc_max << "ns" << std::endl;
// max avg
std::cout << "max avg time:" << avg_max << "ns" << std::endl;
}
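For reference, an illustrative single-thread run of output_perf_count() would print lines of the following shape (format taken from the cout statements above; the numbers are invented):

Perf count data in perfCountEnd node with name last_parameter_to_first_result is:
accumulated time:105000ns, iteration:100 avg time:1050ns on thread:0
max accumulated time:105000ns
max avg time:1050ns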
} // namespace op
} // namespace snippets
} // namespace ov

View File

@ -42,6 +42,7 @@
#include "snippets/lowered/pass/validate_loops.hpp" #include "snippets/lowered/pass/validate_loops.hpp"
#include "snippets/lowered/pass/insert_loops.hpp" #include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/optimize_domain.hpp" #include "snippets/lowered/pass/optimize_domain.hpp"
#include "snippets/lowered/pass/insert_perf_count.hpp"
#include "transformations/utils/utils.hpp" #include "transformations/utils/utils.hpp"
@ -349,7 +350,8 @@ VectorDims Subgraph::infer_master_shape() {
std::shared_ptr<lowered::LinearIR>
Subgraph::convert_body_to_linear_ir(const std::shared_ptr<IShapeInferSnippetsFactory>& shape_infer_factory) {
lowered::Config lowering_config;
lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops ||
(lowering_config.perf_count_mode != lowered::PerfCountMode::Disabled);
lowering_config.m_need_fill_tail_register = config.m_has_domain_sensitive_ops;
lowering_config.m_loop_depth = tileRank;
lowering_config.m_enable_domain_optimization = !config.m_has_domain_sensitive_ops;
@ -487,6 +489,10 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const lowered::pass::PassPi
auto linear_ir {*m_linear_ir->clone()};
LoweringResult lowering_result;
control_flow_transformations(linear_ir, lowering_result, backend_passes_pre_common, backend_passes_post_common);
if (linear_ir.get_config().perf_count_mode == lowered::PerfCountMode::Chrono) {
lowered::pass::InsertPerfCount perf_count_pass;
perf_count_pass.run(linear_ir);
}
m_generator->generate(linear_ir, lowering_result, compile_params);
VectorDims parallel_exec_domain = linear_ir.get_master_shape();

View File

@ -55,6 +55,8 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry
SHAPE_INFER_PREDEFINED(op::Scalar, SingleElementShapeInfer),
SHAPE_INFER_PREDEFINED(op::VectorBuffer, SingleElementShapeInfer),
SHAPE_INFER_PREDEFINED(op::LoopEnd, EmptyShapeInfer),
SHAPE_INFER_PREDEFINED(op::PerfCountBegin, EmptyShapeInfer),
SHAPE_INFER_PREDEFINED(op::PerfCountEnd, EmptyShapeInfer),
SHAPE_INFER_PREDEFINED(op::Kernel, EmptyShapeInfer),
SHAPE_INFER_PREDEFINED(op::Nop, EmptyShapeInfer),
SHAPE_INFER_OP_SPECIFIC_EXTERNAL(opset1::Select, SelectShapeInfer),

View File

@ -41,6 +41,8 @@ DummyTargetMachine::DummyTargetMachine(const std::vector<ov::Node::type_info_t>&
jitters[ov::snippets::op::Kernel::get_type_info_static()] = dummy_functor;
jitters[ov::snippets::op::LoopBegin::get_type_info_static()] = dummy_functor;
jitters[ov::snippets::op::LoopEnd::get_type_info_static()] = dummy_functor;
jitters[ov::snippets::op::PerfCountBegin::get_type_info_static()] = dummy_functor;
jitters[ov::snippets::op::PerfCountEnd::get_type_info_static()] = dummy_functor;
jitters[ov::snippets::op::Brgemm::get_type_info_static()] = dummy_functor;
jitters[ov::snippets::op::Buffer::get_type_info_static()] = dummy_functor;
jitters[ov::snippets::op::VectorBuffer::get_type_info_static()] = dummy_functor;

View File

@ -116,6 +116,20 @@ struct ThreadLocal {
auto end() const -> Iterator<decltype(_map.end())> const {
return {_map.end()};
}
// CombineFunc has signature T(T,T) or T(const T&, const T&)
template <typename CombineFunc>
T combine(CombineFunc f_combine) {
if (begin() != end()) {
auto ci = begin();
T my_result = *ci;
while (++ci != end())
my_result = f_combine(my_result, *ci);
return my_result;
} else {
return _create();
}
}
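A usage sketch for combine(), mirroring the reduce-max in PerfCountEnd::output_perf_count() (illustrative; assumes ThreadLocal<uint64_t> is default-constructible, as its use elsewhere in this commit suggests):

ov::threading::ThreadLocal<uint64_t> acc;
// reduce all per-thread values down to the maximum
uint64_t max_acc = acc.combine([](const uint64_t& a, const uint64_t& b) {
    return a >= b ? a : b;
});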
};
#endif

View File

@ -14,12 +14,15 @@
#include "jit_dnnl_emitters.hpp" #include "jit_dnnl_emitters.hpp"
#include "jit_dnnl_ext_emitters.hpp" #include "jit_dnnl_ext_emitters.hpp"
#include "jit_conversion_emitters.hpp" #include "jit_conversion_emitters.hpp"
#include "jit_perf_count_chrono_emitters.hpp"
#include "jit_perf_count_rdtsc_emitters.hpp"
#include "transformations/snippets/x64/op/load_convert.hpp" #include "transformations/snippets/x64/op/load_convert.hpp"
#include "transformations/snippets/x64/op/store_convert.hpp" #include "transformations/snippets/x64/op/store_convert.hpp"
#include "transformations/snippets/x64/op/fused_mul_add.hpp" #include "transformations/snippets/x64/op/fused_mul_add.hpp"
#include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
#include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp"
#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp"
#include "transformations/cpu_opset/common/op/swish_cpu.hpp" #include "transformations/cpu_opset/common/op/swish_cpu.hpp"
#include "transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp" #include "transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp"
@ -157,6 +160,11 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho
jitters[snippets::op::LoopEnd::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(LoopEndEmitter);
jitters[intel_cpu::BrgemmCPU::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(BrgemmEmitter);
jitters[intel_cpu::BrgemmCopyB::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(BrgemmCopyBEmitter);
jitters[snippets::op::PerfCountBegin::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_start_emitter);
jitters[snippets::op::PerfCountEnd::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_end_emitter);
jitters[ov::intel_cpu::PerfCountRdtscBegin::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_start_emitter);
jitters[ov::intel_cpu::PerfCountRdtscEnd::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_end_emitter);
}
size_t intel_cpu::CPUTargetMachine::get_lanes() const {
@ -224,6 +232,10 @@ snippets::Generator::opRegType intel_cpu::CPUGenerator::get_specific_op_reg_type
}
bool intel_cpu::CPUGenerator::uses_precompiled_kernel(const std::shared_ptr<snippets::Emitter>& e) const {
return std::dynamic_pointer_cast<intel_cpu::BrgemmEmitter>(e) ||
std::dynamic_pointer_cast<intel_cpu::BrgemmCopyBEmitter>(e) ||
std::dynamic_pointer_cast<intel_cpu::jit_perf_count_chrono_start_emitter>(e) ||
std::dynamic_pointer_cast<intel_cpu::jit_perf_count_chrono_end_emitter>(e) ||
std::dynamic_pointer_cast<intel_cpu::jit_perf_count_rdtsc_start_emitter>(e) ||
std::dynamic_pointer_cast<intel_cpu::jit_perf_count_rdtsc_end_emitter>(e);
}
} // namespace ov

View File

@ -213,5 +213,73 @@ void jit_emitter::emit_code(const std::vector<size_t> &in_idxs, const std::vecto
emitter_postamble();
}
void jit_emitter::internal_call_preamble() const {
// gprs
Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
h->sub(h->rsp, n_gprs_to_save * gpr_size);
for (size_t i = 0; i < n_gprs_to_save; ++i)
h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);
// mask regs
// need to preserve k-regs based on CPU capability rather than the host ISA,
// in case emitters with different ISAs coexist in one subgraph KernelEmitter for perf reasons in the future.
// e.g. the other emitters' ISA is avx512 while this emitter's ISA is avx2 and an internal call is used; the internal call may use avx512 and spoil the k-regs.
// Platforms with avx512_common but without avx512_core (Knights Landing) are obsolete and not a concern.
if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
h->sub(h->rsp, k_mask_num * k_mask_size);
for (size_t i = 0; i < k_mask_num; ++i) {
h->kmovq(h->ptr[h->rsp + i * k_mask_size], Xbyak::Opmask(static_cast<int>(i)));
}
}
// vector regs
// 1. Caller obligation to save vector registers as callee may use them.
// 2. There is an implicit assumption that the host code uses the same
// `isa` as the injector. Once the assumption is wrong, `vecs_count` and
// `vlen` should be replaced with `host_isa::vlen` and
// `host_isa::vecs_count`.
h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
for (size_t i = 0; i < get_max_vecs_count(); ++i) {
push_vec(h->ptr[h->rsp + i * get_vec_length()], i);
}
}
void jit_emitter::internal_call_postamble() const {
// restore vector registers
for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
pop_vec(static_cast<size_t>(i), h->ptr[h->rsp + i * get_vec_length()]);
}
h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());
// restore k reg
if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
for (int i = k_mask_num - 1; i >= 0; --i) {
h->kmovq(Xbyak::Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
}
h->add(h->rsp, k_mask_num * k_mask_size);
}
// restore gpr registers
Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
for (int i = n_gprs_to_save - 1; i >= 0; --i)
h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
h->add(h->rsp, n_gprs_to_save * gpr_size);
}
void jit_emitter::internal_call_rsp_align() const {
h->mov(h->rbx, h->rsp);
h->and_(h->rbx, 0xf);
h->sub(h->rsp, h->rbx);
}
void jit_emitter::internal_call_rsp_restore() const {
h->add(h->rsp, h->rbx);
}
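A worked example of the align/restore pair above (addresses illustrative):

// entry:               rsp = 0x7ffdab34c8
// h->and_(rbx, 0xf):   rbx = 0xc8 & 0xf = 8
// h->sub(rsp, rbx):    rsp = 0x7ffdab34c0 -> 16-byte aligned for the call
// restore:             add rsp, rbx undoes the shift; rbx is callee-saved per the ABI,
//                      so a conforming callee cannot clobber it between the two helpers.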
} // namespace intel_cpu
} // namespace ov

View File

@ -106,6 +106,8 @@ protected:
mutable std::vector<size_t> aux_gpr_idxs;
static constexpr int k_mask_size = 8;
static constexpr int k_mask_num = 8;
static constexpr int gpr_size = 8;
Xbyak::Address table_val(std::string key, size_t key_off_val_shift = 0) const {
auto off = table_off(key, key_off_val_shift);
@ -130,6 +132,13 @@ protected:
}
}
void internal_call_preamble() const;
void internal_call_postamble() const;
// align the stack on a 16-byte boundary as the ABI requires
// rbx must not be changed by the callee (it is callee-saved), since it holds the alignment offset until internal_call_rsp_restore() runs.
void internal_call_rsp_align() const;
void internal_call_rsp_restore() const;
private:
mutable std::vector<size_t> preserved_vec_idxs;
mutable std::vector<size_t> preserved_gpr_idxs;

View File

@ -0,0 +1,73 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "jit_emitter.hpp"
#include "jit_perf_count_chrono_emitters.hpp"
#include <cpu/x64/jit_generator.hpp>
using namespace dnnl::impl;
using namespace dnnl::impl::utils;
using namespace dnnl::impl::cpu;
using namespace dnnl::impl::cpu::x64;
using namespace Xbyak;
using namespace Xbyak::util;
namespace ov {
namespace intel_cpu {
jit_perf_count_chrono_start_emitter::jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
m_start_node = ov::as_type_ptr<snippets::op::PerfCountBegin>(n);
}
size_t jit_perf_count_chrono_start_emitter::get_inputs_num() const {
return 0;
}
void jit_perf_count_chrono_start_emitter::set_start_time(snippets::op::PerfCountBegin* start_node) {
start_node->set_start_time();
}
void jit_perf_count_chrono_start_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
internal_call_preamble();
const auto &set_start_time_overload = static_cast<void (*)(snippets::op::PerfCountBegin*)>(set_start_time);
h->mov(h->rax, reinterpret_cast<size_t>(set_start_time_overload));
h->mov(abi_param1, reinterpret_cast<size_t>(m_start_node.get()));
internal_call_rsp_align();
h->call(h->rax);
internal_call_rsp_restore();
internal_call_postamble();
}
///////////////////jit_perf_count_chrono_end_emitter////////////////////////////////////
jit_perf_count_chrono_end_emitter::jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
m_end_node = ov::as_type_ptr<snippets::op::PerfCountEnd>(n);
}
size_t jit_perf_count_chrono_end_emitter::get_inputs_num() const {
return 0;
}
void jit_perf_count_chrono_end_emitter::set_accumulated_time(snippets::op::PerfCountEnd* end_node) {
end_node->set_accumulated_time();
}
void jit_perf_count_chrono_end_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
internal_call_preamble();
const auto &set_accumulated_time_overload = static_cast<void (*)(snippets::op::PerfCountEnd*)>(set_accumulated_time);
h->mov(h->rax, reinterpret_cast<size_t>(set_accumulated_time_overload));
h->mov(abi_param1, reinterpret_cast<size_t>(m_end_node.get()));
internal_call_rsp_align();
h->call(h->rax);
internal_call_rsp_restore();
internal_call_postamble();
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,40 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "jit_emitter.hpp"
#include <cpu/x64/jit_generator.hpp>
#include "snippets/op/perf_count.hpp"
namespace ov {
namespace intel_cpu {
class jit_perf_count_chrono_start_emitter : public jit_emitter {
public:
jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override;
private:
void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
static void set_start_time(snippets::op::PerfCountBegin* start_node);
std::shared_ptr<snippets::op::PerfCountBegin> m_start_node = nullptr;
};
class jit_perf_count_chrono_end_emitter : public jit_emitter {
public:
jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override;
private:
void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
static void set_accumulated_time(snippets::op::PerfCountEnd* end_node);
std::shared_ptr<snippets::op::PerfCountEnd> m_end_node = nullptr;
};
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,86 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "jit_emitter.hpp"
#include "jit_perf_count_rdtsc_emitters.hpp"
#include <cpu/x64/jit_generator.hpp>
using namespace dnnl::impl;
using namespace dnnl::impl::utils;
using namespace dnnl::impl::cpu;
using namespace dnnl::impl::cpu::x64;
using namespace Xbyak;
using namespace Xbyak::util;
namespace ov {
namespace intel_cpu {
jit_perf_count_rdtsc_start_emitter::jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
m_start_node = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscBegin>(n);
}
size_t jit_perf_count_rdtsc_start_emitter::get_inputs_num() const {
return 0;
}
void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
h->push(h->rax);
h->push(h->rdx);
// The EDX register is loaded with the high-order 32 bits of the MSR and the EAX register is loaded with the low-order 32 bits.
h->lfence();
h->rdtsc();
h->lfence();
h->shl(h->rdx, 0x20); // shift to higher half of rdx 0x20(32)
h->or_(h->rdx, h->rax); // rdx has current tsc
h->mov(h->rax, reinterpret_cast<size_t>(&m_start_node->start_count));
h->mov(qword[h->rax], h->rdx);
h->pop(h->rdx);
h->pop(h->rax);
}
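The emitted sequence above corresponds roughly to the following host-side C++ using compiler intrinsics (illustrative; header and intrinsic names assume GCC/Clang, and the helper name is hypothetical):

#include <x86intrin.h>   // __rdtsc, _mm_lfence (GCC/Clang)
static inline uint64_t read_tsc_serialized() {
    _mm_lfence();                  // keep earlier instructions from drifting past the read
    uint64_t tsc = __rdtsc();      // combines EDX:EAX into one 64-bit value
    _mm_lfence();                  // keep later instructions from moving before the read
    return tsc;                    // what the JIT code stores into start_count
}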
///////////////////jit_perf_count_rdtsc_end_emitter////////////////////////////////////
jit_perf_count_rdtsc_end_emitter::jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
m_end_node = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscEnd>(n);
}
size_t jit_perf_count_rdtsc_end_emitter::get_inputs_num() const {
return 0;
}
void jit_perf_count_rdtsc_end_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
h->push(h->rax);
h->push(h->rdx);
h->lfence();
h->rdtsc();
h->lfence();
h->shl(h->rdx, 0x20);
h->or_(h->rdx, h->rax); // rdx has current tsc
// tsc duration
h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->get_pc_begin()->start_count));
h->sub(h->rdx, qword[h->rax]); // rdx has tsc duration
// accumulation = accumulation + tsc duration
h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->accumulation));
h->add(h->rdx, qword[h->rax]);
h->mov(qword[h->rax], h->rdx);
// iteration++
h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->iteration));
h->mov(h->rdx, qword[h->rax]);
h->add(h->rdx, 0x01);
h->mov(qword[h->rax], h->rdx);
h->pop(h->rdx);
h->pop(h->rax);
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,37 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "jit_emitter.hpp"
#include <cpu/x64/jit_generator.hpp>
#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp"
namespace ov {
namespace intel_cpu {
class jit_perf_count_rdtsc_start_emitter : public jit_emitter {
public:
jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override;
private:
void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
std::shared_ptr<ov::intel_cpu::PerfCountRdtscBegin> m_start_node = nullptr;
};
class jit_perf_count_rdtsc_end_emitter : public jit_emitter {
public:
jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n);
size_t get_inputs_num() const override;
private:
void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
std::shared_ptr<ov::intel_cpu::PerfCountRdtscEnd> m_end_node = nullptr;
};
} // namespace intel_cpu
} // namespace ov

View File

@ -26,10 +26,6 @@ using jit_generator = dnnl::impl::cpu::x64::jit_generator;
using cpu_isa_t = dnnl::impl::cpu::x64::cpu_isa_t;
using ExpressionPtr = ov::snippets::lowered::ExpressionPtr;
inline static void transform_idxs_to_regs(const std::vector<size_t>& idxs, std::vector<Reg64>& regs) {
regs.resize(idxs.size());
std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx){return Reg64(static_cast<int>(idx));});
@ -1114,32 +1110,7 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, c
h->add(h->rsp, n_gprs_to_save * gpr_size);
}
internal_call_preamble();
// save function address in gpr to pass in call instruction
const auto& brgemm_kernel_overload = static_cast<void (*)(const brgemm_kernel_t*,
@ -1193,38 +1164,15 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, c
h->mov(abi_param6, static_cast<int>(m_with_comp));
#endif
internal_call_rsp_align();
h->call(h->rbp);
internal_call_rsp_restore();
#ifdef _WIN32
h->add(h->rsp, num_args_passed_on_stack * gpr_size);
#endif
internal_call_postamble();
}
void BrgemmEmitter::kernel_execute(const brgemm_kernel_t *brg_kernel, void BrgemmEmitter::kernel_execute(const brgemm_kernel_t *brg_kernel,
@ -1358,32 +1306,7 @@ void BrgemmCopyBEmitter::emit_impl(const std::vector<size_t>& in,
void BrgemmCopyBEmitter::emit_kernel_call(const matmul::jit_brgemm_matmul_copy_b_t* kernel, Reg64 src, Reg64 dst, Reg64 comp,
size_t N, size_t K, size_t offset_in, size_t offset_out, size_t offset_comp) const {
internal_call_preamble();
const auto data_ptr = [&](Xmm xmm, Xbyak::Reg64 reg, size_t bytes_offset) {
h->uni_vmovq(reg, xmm);
@ -1437,38 +1360,16 @@ void BrgemmCopyBEmitter::emit_kernel_call(const matmul::jit_brgemm_matmul_copy_b
h->mov(abi_param5, N);
h->mov(abi_param6, K);
#endif
internal_call_rsp_align();
h->call(h->rbp);
internal_call_rsp_restore();
#ifdef _WIN32
h->add(h->rsp, gpr_size * num_args_passed_on_stack);
#endif
internal_call_postamble();
}
void BrgemmCopyBEmitter::execute(matmul::jit_brgemm_matmul_copy_b_t *kernel, const void *src,

View File

@ -15,6 +15,7 @@
#include "transformations/snippets/x64/op/store_convert.hpp" #include "transformations/snippets/x64/op/store_convert.hpp"
#include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp"
#include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp"
#include <ov_ops/augru_cell.hpp>
#include <ov_ops/augru_sequence.hpp>
@ -159,12 +160,16 @@ std::map<std::string, ngraph::OpSet> Extension::getOpSets() {
NGRAPH_OP(Subgraph, ov::snippets::op)
NGRAPH_OP(VectorBuffer, ov::snippets::op)
NGRAPH_OP(RankNormalization, ov::snippets::op)
NGRAPH_OP(PerfCountBegin, ov::snippets::op)
NGRAPH_OP(PerfCountEnd, ov::snippets::op)
NGRAPH_OP_X64(LoadConvertSaturation, ov::intel_cpu)
NGRAPH_OP_X64(LoadConvertTruncation, ov::intel_cpu)
NGRAPH_OP_X64(StoreConvertSaturation, ov::intel_cpu)
NGRAPH_OP_X64(StoreConvertTruncation, ov::intel_cpu)
NGRAPH_OP_X64(BrgemmCPU, ov::intel_cpu)
NGRAPH_OP_X64(BrgemmCopyB, ov::intel_cpu)
NGRAPH_OP_X64(PerfCountRdtscBegin, ov::intel_cpu)
NGRAPH_OP_X64(PerfCountRdtscEnd, ov::intel_cpu)
#undef NGRAPH_OP
return opset;

View File

@ -0,0 +1,32 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "perf_count_rdtsc.hpp"
using namespace ov;
using namespace ov::intel_cpu;
/////////////////////////PerfCountRdtscBegin//////////////////////
PerfCountRdtscBegin::PerfCountRdtscBegin() : PerfCountBeginBase() {
validate_and_infer_types_except_PerfCountEnd();
}
std::shared_ptr<Node> PerfCountRdtscBegin::clone_with_new_inputs(const OutputVector& inputs) const {
return std::make_shared<PerfCountRdtscBegin>();
}
/////////////////////////PerfCountRdtscEnd//////////////////////
PerfCountRdtscEnd::PerfCountRdtscEnd(const Output<Node>& pc_begin) : ov::snippets::op::PerfCountEndBase({pc_begin}), accumulation(0ul), iteration(0u) {
constructor_validate_and_infer_types();
}
std::shared_ptr<Node> PerfCountRdtscEnd::clone_with_new_inputs(const OutputVector& inputs) const {
return std::make_shared<PerfCountRdtscEnd>(inputs.at(0));
}
std::shared_ptr<PerfCountRdtscBegin> PerfCountRdtscEnd::get_pc_begin() {
const auto& pc_begin = ov::as_type_ptr<PerfCountRdtscBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
OPENVINO_ASSERT(pc_begin != nullptr, "PerfCountRdtscEnd last input is not connected to PerfCountRdtscBegin");
return pc_begin;
}

View File

@ -0,0 +1,55 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "openvino/op/op.hpp"
#include "snippets/op/perf_count.hpp"
using namespace ov::snippets::op;
namespace ov {
namespace intel_cpu {
/**
* @interface PerfCountRdtscBegin
* @brief Performance count start time via read rdtsc register
* @ingroup snippets
*/
class PerfCountRdtscBegin : public PerfCountBeginBase {
public:
OPENVINO_OP("PerfCountRdtscBegin", "SnippetsOpset", PerfCountBeginBase);
PerfCountRdtscBegin();
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
uint64_t start_count = 0ul;
};
/**
* @interface PerfCountRdtscEnd
* @brief Performance count end time and duration
* @ingroup snippets
*/
class PerfCountRdtscEnd : public PerfCountEndBase {
public:
OPENVINO_OP("PerfCountRdtscEnd", "SnippetsOpset", PerfCountEndBase);
PerfCountRdtscEnd(const Output<Node>& pc_begin);
PerfCountRdtscEnd() = default;
~PerfCountRdtscEnd() {
uint64_t avg = iteration == 0 ? 0 : accumulation / iteration;
std::cout << "accumulation:" << accumulation << " iteration:" << iteration << " avg:" << avg << std::endl;
}
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
std::shared_ptr<PerfCountRdtscBegin> get_pc_begin();
// On each call, PerfCountRdtscBegin stores start_count.
// On each call, PerfCountRdtscEnd reads the current count, then accumulation += current_count - start_count, and iteration++.
// The destructor of PerfCountRdtscEnd outputs the perf info.
// accumulation is a cycle count.
uint64_t accumulation = 0ul;
uint32_t iteration = 0u;
};
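Note that accumulation here is a raw TSC cycle count, not nanoseconds as in the chrono variant; converting to time requires the invariant TSC frequency. An illustrative conversion, assuming a hypothetical 2.0 GHz TSC:

uint64_t avg_cycles = iteration == 0 ? 0 : accumulation / iteration;
double avg_ns = avg_cycles / 2.0;   // 2.0 cycles per nanosecond at the assumed 2 GHz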
} // namespace intel_cpu
} // namespace ov

View File

@ -9,6 +9,7 @@
#include "op/fused_mul_add.hpp" #include "op/fused_mul_add.hpp"
#include "op/load_convert.hpp" #include "op/load_convert.hpp"
#include "op/store_convert.hpp" #include "op/store_convert.hpp"
#include "op/perf_count_rdtsc.hpp"
#include "transformations/cpu_opset/common/op/swish_cpu.hpp" #include "transformations/cpu_opset/common/op/swish_cpu.hpp"
namespace ov { namespace ov {
@ -38,6 +39,8 @@ const CPUShapeInferSnippetsFactory::TRegistry CPUShapeInferSnippetsFactory::spec
SHAPE_INFER_PREDEFINED(ov::intel_cpu::LoadConvertTruncation, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertSaturation, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertTruncation, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscBegin, EmptyShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscEnd, EmptyShapeInfer),
SHAPE_INFER_OP_SPECIFIC_EXTERNAL(ov::intel_cpu::BrgemmCPU, BrgemmShapeInfer),
//
SHAPE_INFER_OP_SPECIFIC(ov::intel_cpu::BrgemmCopyB),