[SnippetS] Perf count nodes and emitters (#19493)
Parent: 791762fb19 · Commit: f80793e420
@ -29,7 +29,7 @@ class Generator;
 class LoweringResult {
     friend class Generator;
     // Some emitters rely on other precompiled kernels.
-    // We need to keep the pointers to such emitters alive, so the kernels would still be accessible at runtime.
+    // We need to keep the pointers to such emitters alive, so the kernels or nodes would still be accessible at runtime.
     std::vector<std::shared_ptr<Emitter>> m_saved_emitters{};

 public:
@ -14,6 +14,18 @@ namespace ov {
 namespace snippets {
 namespace lowered {

+// Snippets performance count mode:
+// Disabled - default, no perf count for snippets.
+// Chrono - perf count via std::chrono calls. A universal method that supports the multi-threaded case and outputs perf count data for each thread.
+// BackendSpecific - perf count provided by the backend, for device-specific requirements.
+// For example, for the sake of lower overhead and more accurate results, an x86-specific mode that reads the RDTSC register is implemented;
+// a pair of perf count start/end executions takes ~50ns with RDTSC versus ~260ns with Chrono, measured on ICX. This mode supports a single thread only.
+enum PerfCountMode {
+    Disabled,
+    Chrono,
+    BackendSpecific,
+};

 class Config {
 public:
     // True if the lowered Emitters need to be accessed during runtime. Normally they're destroyed after code emission.
@ -21,6 +33,7 @@ public:
     // True if we should check runtime info for nodes to call specific needed transformations
     bool m_need_fill_tail_register = false;
     size_t m_loop_depth = 1;
+    PerfCountMode perf_count_mode = PerfCountMode::Disabled;
     // Some Subgraphs don't support domain optimization due to operations' semantics
     bool m_enable_domain_optimization = false;
     // Minimal advised work amount for parallel execution.
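As a sketch of how a developer would opt into the new mode (field names taken from the Config above; the wiring in subgraph.cpp further down keys off the same flag):

    // Usage sketch: enable Chrono perf counting in the lowered Config.
    ov::snippets::lowered::Config cfg;
    cfg.perf_count_mode = ov::snippets::lowered::PerfCountMode::Chrono;
    // Expressions must be saved so the perf count nodes stay alive at runtime
    // (subgraph.cpp below derives m_save_expressions from perf_count_mode).
    cfg.m_save_expressions = true;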
@ -0,0 +1,33 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "pass.hpp"

#include "snippets/op/perf_count.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
 * @interface InsertPerfCount
 * @brief Insert a PerfCountBegin node after the last parameter and a PerfCountEnd node before the first result.
 * This is an illustrative transformation to enable perf count in snippets.
 * Developers could modify it to insert perf count pairs around any sequence of nodes of interest.
 * @ingroup snippets
 */
class InsertPerfCount: public Pass {
public:
    OPENVINO_RTTI("InsertPerfCount", "Pass")
    InsertPerfCount() = default;
    bool run(LinearIR& linear_ir) override;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
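A minimal usage sketch of the pass (this mirrors the invocation added to subgraph.cpp later in this diff):

    // Run the illustrative pass on an existing LinearIR.
    ov::snippets::lowered::pass::InsertPerfCount perf_count_pass;
    perf_count_pass.run(linear_ir);  // returns false if linear_ir is empty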
src/common/snippets/include/snippets/op/perf_count.hpp (new file, 93 lines)
@ -0,0 +1,93 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"
#include "openvino/runtime/threading/thread_local.hpp"

namespace ov {
namespace snippets {
namespace op {

/**
 * @interface PerfCountBeginBase
 * @brief Base class for PerfCountBegin and PerfCountRdtscBegin (CPU)
 * @ingroup snippets
 */
class PerfCountBeginBase : public ov::op::Op {
public:
    OPENVINO_OP("PerfCountBeginBase", "SnippetsOpset");
    PerfCountBeginBase(const std::vector<Output<Node>>& args);
    PerfCountBeginBase() = default;

    void validate_and_infer_types() override;
    bool visit_attributes(AttributeVisitor& visitor) override;

protected:
    void validate_and_infer_types_except_PerfCountEnd();
};

/**
 * @interface PerfCountEndBase
 * @brief Base class for PerfCountEnd and PerfCountRdtscEnd
 * @ingroup snippets
 */
class PerfCountEndBase : public ov::op::Op {
public:
    OPENVINO_OP("PerfCountEndBase", "SnippetsOpset");
    PerfCountEndBase(const std::vector<Output<Node>>& args);
    PerfCountEndBase() = default;

    void validate_and_infer_types() override;
    bool visit_attributes(AttributeVisitor& visitor) override;
};

/**
 * @interface PerfCountBegin
 * @brief Performance count start time with chrono call
 * @ingroup snippets
 */
class PerfCountBegin : public PerfCountBeginBase {
public:
    OPENVINO_OP("PerfCountBegin", "SnippetsOpset", PerfCountBeginBase);
    PerfCountBegin();

    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;

    void set_start_time();
    std::chrono::high_resolution_clock::time_point& get_start_time();

private:
    ov::threading::ThreadLocal<std::chrono::high_resolution_clock::time_point> start_time_stamp;
};

/**
 * @interface PerfCountEnd
 * @brief Performance count end time and duration with chrono call
 * @ingroup snippets
 */
class PerfCountEnd : public PerfCountEndBase {
public:
    OPENVINO_OP("PerfCountEnd", "SnippetsOpset", PerfCountEndBase);
    PerfCountEnd(const Output<Node>& pc_begin);
    PerfCountEnd() = default;
    ~PerfCountEnd() {
        output_perf_count();
    }
    void output_perf_count();
    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;

    void init_pc_begin();
    void set_accumulated_time();

private:
    ov::threading::ThreadLocal<uint64_t> accumulation;
    ov::threading::ThreadLocal<uint32_t> iteration;
    std::shared_ptr<PerfCountBegin> m_pc_begin = nullptr;
};

} // namespace op
} // namespace snippets
} // namespace ov
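Conceptually, each PerfCountBegin/PerfCountEnd pair accumulates per-thread durations like the following self-contained std::chrono sketch (the ThreadLocal indirection is elided; names here are illustrative only):

    #include <chrono>
    #include <cstdint>

    // Sketch of what one begin/end pair accumulates on a single thread.
    struct PerfPair {
        std::chrono::high_resolution_clock::time_point start;
        uint64_t accumulation = 0;  // total nanoseconds over all iterations
        uint32_t iteration = 0;

        void begin() { start = std::chrono::high_resolution_clock::now(); }
        void end() {
            const auto now = std::chrono::high_resolution_clock::now();
            accumulation += std::chrono::duration_cast<std::chrono::nanoseconds>(now - start).count();
            ++iteration;
        }
    };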
@ -25,6 +25,7 @@
 #include "op/brgemm.hpp"
 #include "op/vector_buffer.hpp"
 #include "op/rank_normalization.hpp"
+#include "op/perf_count.hpp"

 namespace ov {
 namespace snippets {
@ -24,6 +24,9 @@ OV_OP(Scalar, ov::snippets::op)
 OV_OP(Nop, ov::snippets::op)
 OV_OP(RankNormalization, ov::snippets::op)

+OV_OP(PerfCountBegin, ov::snippets::op)
+OV_OP(PerfCountEnd, ov::snippets::op)

 // Layout-oblivious from opset1

 // opset completeness
@ -44,7 +44,8 @@ void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, c
 }
 OV_ITT_TASK_NEXT(GENERATE, "::GetSnippet")

-// Note: some emitters use precompiled kernels. They need to be saved, so the kernels are accessible at runtime.
+// 1. Some emitters use precompiled kernels. They need to be saved, so the kernels are accessible at runtime.
+// 2. A perf count node held as a field of an emitter should stay alive at runtime.
 if (linear_ir.get_config().m_save_expressions) {
     for (const auto& expr : linear_ir) {
         const auto& emitter = expr->get_emitter();
@ -66,7 +67,9 @@ Generator::opRegType Generator::get_op_reg_type(const std::shared_ptr<Node>& op)
     std::dynamic_pointer_cast<op::LoopEnd>(op) ||
     std::dynamic_pointer_cast<op::Brgemm>(op) ||
     std::dynamic_pointer_cast<op::Buffer>(op) ||
-    std::dynamic_pointer_cast<op::RankNormalization>(op))
+    std::dynamic_pointer_cast<op::RankNormalization>(op) ||
+    std::dynamic_pointer_cast<op::PerfCountBeginBase>(op) ||
+    std::dynamic_pointer_cast<op::PerfCountEndBase>(op))
     return gpr2gpr;
 else if (std::dynamic_pointer_cast<snippets::op::Load>(op) ||
          std::dynamic_pointer_cast<snippets::op::BroadcastLoad>(op))
src/common/snippets/src/lowered/pass/insert_perf_count.cpp (new file, 62 lines)
@ -0,0 +1,62 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/lowered/pass/insert_perf_count.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/itt.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

bool InsertPerfCount::run(LinearIR& linear_ir) {
    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertPerfCount")
    if (linear_ir.empty())
        return false;

    auto is_parameter = [](const std::shared_ptr<ov::Node>& node) {
        return ov::is_type<ov::op::v0::Parameter>(node);
    };
    auto is_result = [](const std::shared_ptr<ov::Node>& node) {
        return ov::is_type<ov::op::v0::Result>(node);
    };

    // mark the perf_count_begin and perf_count_end positions
    auto perf_count_begin_pos = linear_ir.cbegin();
    auto perf_count_end_pos = perf_count_begin_pos;
    bool first_result_marked = false;
    for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) {
        const auto expr = *expr_it;
        const auto& node = expr->get_node();
        if (is_parameter(node))
            perf_count_begin_pos = expr_it;

        if (is_result(node) && !first_result_marked) {
            perf_count_end_pos = expr_it;
            first_result_marked = true;
        }
    }

    // insert perf_count_begin after the last parameter
    // linear_ir.insert inserts before the iterator, so move to the next position first
    perf_count_begin_pos = std::next(perf_count_begin_pos);
    const auto& perf_count_begin = std::make_shared<op::PerfCountBegin>();
    const auto& perf_count_begin_expr = linear_ir.create_expression(perf_count_begin, std::vector<PortConnectorPtr>{});
    linear_ir.insert(perf_count_begin_pos, perf_count_begin_expr);

    // insert perf_count_end before the first result
    const auto& perf_count_end = std::make_shared<op::PerfCountEnd>(perf_count_begin->output(0));
    perf_count_end->set_friendly_name("last_parameter_to_first_result");
    const auto& perf_count_end_expr = linear_ir.create_expression(perf_count_end, std::vector<PortConnectorPtr>{});
    linear_ir.insert(perf_count_end_pos, perf_count_end_expr);

    return true;
}

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
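For intuition, the pass changes the expression order roughly as follows (schematic only, not actual LinearIR dump output):

    // before:  Parameter ... Parameter | <body ops> | Result ... Result
    // after:   Parameter ... Parameter | PerfCountBegin | <body ops> | PerfCountEnd | Result ... Result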
src/common/snippets/src/op/perf_count.cpp (new file, 115 lines)
@ -0,0 +1,115 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/op/perf_count.hpp"

namespace ov {
namespace snippets {
namespace op {

/////////////////PerfCountBeginBase/////////////////
PerfCountBeginBase::PerfCountBeginBase(const std::vector<Output<Node>>& args) : Op() {}

void PerfCountBeginBase::validate_and_infer_types() {
    validate_and_infer_types_except_PerfCountEnd();
    OPENVINO_ASSERT(get_output_size() == 1, "PerfCountBegin must have only one output");
    const auto& last_output_inputs = get_output_target_inputs(0);
    OPENVINO_ASSERT(last_output_inputs.size() == 1, "PerfCountBegin must have exactly one consumer attached to its last output");
    const auto& pc_end = ov::as_type_ptr<PerfCountEndBase>(last_output_inputs.begin()->get_node()->shared_from_this());
    OPENVINO_ASSERT(pc_end != nullptr, "PerfCountBegin must have PerfCountEnd connected to its last output");
}

bool PerfCountBeginBase::visit_attributes(AttributeVisitor &visitor) {
    return true;
}

void PerfCountBeginBase::validate_and_infer_types_except_PerfCountEnd() {
    NODE_VALIDATION_CHECK(this, get_input_size() == 0, "PerfCountBegin doesn't expect any inputs");
    set_output_type(0, element::f32, {});
}

//////////////////PerfCountEndBase/////////////////
PerfCountEndBase::PerfCountEndBase(const std::vector<Output<Node>> &args) : Op(args) {}

void PerfCountEndBase::validate_and_infer_types() {
    NODE_VALIDATION_CHECK(this, get_input_size() == 1, "PerfCountEndBase must have one input");
    const auto& pc_begin = ov::as_type_ptr<PerfCountBeginBase>(get_input_node_shared_ptr(0));
    NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEndBase must have PerfCountBeginBase as the last argument");
    set_output_type(0, element::f32, {});
}

bool PerfCountEndBase::visit_attributes(AttributeVisitor &visitor) {
    return true;
}

/////////////////PerfCountBegin/////////////////
PerfCountBegin::PerfCountBegin() : PerfCountBeginBase() {
    validate_and_infer_types_except_PerfCountEnd();
}

std::shared_ptr<Node> PerfCountBegin::clone_with_new_inputs(const OutputVector& inputs) const {
    return std::make_shared<PerfCountBegin>();
}

std::chrono::high_resolution_clock::time_point& PerfCountBegin::get_start_time() {
    return start_time_stamp.local();
}

void PerfCountBegin::set_start_time() {
    start_time_stamp.local() = std::chrono::high_resolution_clock::now();
}

//////////////////PerfCountEnd///////////////
PerfCountEnd::PerfCountEnd(const Output<Node>& pc_begin) : PerfCountEndBase({pc_begin}), accumulation(0ul), iteration(0u) {
    constructor_validate_and_infer_types();
    init_pc_begin();
}

std::shared_ptr<Node> PerfCountEnd::clone_with_new_inputs(const OutputVector& inputs) const {
    return std::make_shared<PerfCountEnd>(inputs.at(0));
}

void PerfCountEnd::set_accumulated_time() {
    auto current_time = std::chrono::high_resolution_clock::now();
    auto& start_time = m_pc_begin->get_start_time();
    accumulation.local() += std::chrono::duration_cast<std::chrono::nanoseconds>(current_time - start_time).count();
    iteration.local()++;
}

void PerfCountEnd::init_pc_begin() {
    m_pc_begin = ov::as_type_ptr<PerfCountBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
    NODE_VALIDATION_CHECK(this, m_pc_begin != nullptr, "PerfCountEnd last input is not connected to PerfCountBegin");
}

void PerfCountEnd::output_perf_count() {
    OPENVINO_ASSERT(accumulation.size() == iteration.size(), "accumulation size should be the same as iteration size in the perf_count_end node.");
    auto iterator_iter = iteration.begin();
    auto iterator_acc = accumulation.begin();
    int t_num = 0;
    uint64_t avg_max = 0;
    std::cout << "Perf count data in perfCountEnd node with name " << get_friendly_name() << " is:" << std::endl;
    for (; iterator_iter != iteration.end(); ++iterator_iter, ++iterator_acc) {
        const auto iter = *iterator_iter;
        const auto acc = *iterator_acc;
        uint64_t avg = iter == 0 ? 0 : acc / iter;
        if (avg > avg_max)
            avg_max = avg;
        std::cout << "accumulated time:" << acc << "ns, iteration:" << iter << " avg time:" << avg << "ns" << " on thread:" << t_num << std::endl;
        t_num++;
    }

    // max time across all threads: combine via max reduction
    auto BinaryFunc = [](const uint64_t& a, const uint64_t& b) {
        return a >= b ? a : b;
    };
    // max accumulation
    uint64_t acc_max = accumulation.combine(BinaryFunc);
    std::cout << "max accumulated time:" << acc_max << "ns" << std::endl;
    // max avg
    std::cout << "max avg time:" << avg_max << "ns" << std::endl;
}

} // namespace op
} // namespace snippets
} // namespace ov
@ -42,6 +42,7 @@
 #include "snippets/lowered/pass/validate_loops.hpp"
 #include "snippets/lowered/pass/insert_loops.hpp"
 #include "snippets/lowered/pass/optimize_domain.hpp"
+#include "snippets/lowered/pass/insert_perf_count.hpp"

 #include "transformations/utils/utils.hpp"

@ -349,7 +350,8 @@ VectorDims Subgraph::infer_master_shape() {
 std::shared_ptr<lowered::LinearIR>
 Subgraph::convert_body_to_linear_ir(const std::shared_ptr<IShapeInferSnippetsFactory>& shape_infer_factory) {
     lowered::Config lowering_config;
-    lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops;
+    lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops ||
+        (lowering_config.perf_count_mode != lowered::PerfCountMode::Disabled);
     lowering_config.m_need_fill_tail_register = config.m_has_domain_sensitive_ops;
     lowering_config.m_loop_depth = tileRank;
     lowering_config.m_enable_domain_optimization = !config.m_has_domain_sensitive_ops;
@ -487,6 +489,10 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const lowered::pass::PassPi
     auto linear_ir {*m_linear_ir->clone()};
     LoweringResult lowering_result;
     control_flow_transformations(linear_ir, lowering_result, backend_passes_pre_common, backend_passes_post_common);
+    if (linear_ir.get_config().perf_count_mode == lowered::PerfCountMode::Chrono) {
+        lowered::pass::InsertPerfCount perf_count_pass;
+        perf_count_pass.run(linear_ir);
+    }
     m_generator->generate(linear_ir, lowering_result, compile_params);

     VectorDims parallel_exec_domain = linear_ir.get_master_shape();
@ -55,6 +55,8 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry
 SHAPE_INFER_PREDEFINED(op::Scalar, SingleElementShapeInfer),
 SHAPE_INFER_PREDEFINED(op::VectorBuffer, SingleElementShapeInfer),
 SHAPE_INFER_PREDEFINED(op::LoopEnd, EmptyShapeInfer),
+SHAPE_INFER_PREDEFINED(op::PerfCountBegin, EmptyShapeInfer),
+SHAPE_INFER_PREDEFINED(op::PerfCountEnd, EmptyShapeInfer),
 SHAPE_INFER_PREDEFINED(op::Kernel, EmptyShapeInfer),
 SHAPE_INFER_PREDEFINED(op::Nop, EmptyShapeInfer),
 SHAPE_INFER_OP_SPECIFIC_EXTERNAL(opset1::Select, SelectShapeInfer),
@ -41,6 +41,8 @@ DummyTargetMachine::DummyTargetMachine(const std::vector<ov::Node::type_info_t>&
 jitters[ov::snippets::op::Kernel::get_type_info_static()] = dummy_functor;
 jitters[ov::snippets::op::LoopBegin::get_type_info_static()] = dummy_functor;
 jitters[ov::snippets::op::LoopEnd::get_type_info_static()] = dummy_functor;
+jitters[ov::snippets::op::PerfCountBegin::get_type_info_static()] = dummy_functor;
+jitters[ov::snippets::op::PerfCountEnd::get_type_info_static()] = dummy_functor;
 jitters[ov::snippets::op::Brgemm::get_type_info_static()] = dummy_functor;
 jitters[ov::snippets::op::Buffer::get_type_info_static()] = dummy_functor;
 jitters[ov::snippets::op::VectorBuffer::get_type_info_static()] = dummy_functor;
@ -116,6 +116,20 @@ struct ThreadLocal {
     auto end() const -> Iterator<decltype(_map.end())> const {
         return {_map.end()};
     }

+    // CombineFunc has signature T(T,T) or T(const T&, const T&)
+    template <typename CombineFunc>
+    T combine(CombineFunc f_combine) {
+        if (begin() != end()) {
+            auto ci = begin();
+            T my_result = *ci;
+            while (++ci != end())
+                my_result = f_combine(my_result, *ci);
+            return my_result;
+        } else {
+            return _create();
+        }
+    }
 };

 #endif
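A short sketch of how the new combine() is used for a max-reduction over per-thread values (mirroring PerfCountEnd::output_perf_count above):

    // Reduce per-thread accumulators to a single maximum.
    ov::threading::ThreadLocal<uint64_t> accumulation;  // one counter per thread
    // ... each thread updates accumulation.local() ...
    const uint64_t acc_max = accumulation.combine([](const uint64_t& a, const uint64_t& b) {
        return a >= b ? a : b;  // max reduction
    });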
@ -14,12 +14,15 @@
 #include "jit_dnnl_emitters.hpp"
 #include "jit_dnnl_ext_emitters.hpp"
 #include "jit_conversion_emitters.hpp"
+#include "jit_perf_count_chrono_emitters.hpp"
+#include "jit_perf_count_rdtsc_emitters.hpp"

 #include "transformations/snippets/x64/op/load_convert.hpp"
 #include "transformations/snippets/x64/op/store_convert.hpp"
 #include "transformations/snippets/x64/op/fused_mul_add.hpp"
 #include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
 #include "transformations/snippets/x64/op/brgemm_cpu.hpp"
+#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp"
 #include "transformations/cpu_opset/common/op/swish_cpu.hpp"
 #include "transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp"

@ -157,6 +160,11 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho
 jitters[snippets::op::LoopEnd::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(LoopEndEmitter);
 jitters[intel_cpu::BrgemmCPU::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(BrgemmEmitter);
 jitters[intel_cpu::BrgemmCopyB::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(BrgemmCopyBEmitter);

+jitters[snippets::op::PerfCountBegin::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_start_emitter);
+jitters[snippets::op::PerfCountEnd::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_end_emitter);
+jitters[ov::intel_cpu::PerfCountRdtscBegin::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_start_emitter);
+jitters[ov::intel_cpu::PerfCountRdtscEnd::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_end_emitter);
 }

 size_t intel_cpu::CPUTargetMachine::get_lanes() const {
@ -224,6 +232,10 @@ snippets::Generator::opRegType intel_cpu::CPUGenerator::get_specific_op_reg_type
 }
 bool intel_cpu::CPUGenerator::uses_precompiled_kernel(const std::shared_ptr<snippets::Emitter>& e) const {
     return std::dynamic_pointer_cast<intel_cpu::BrgemmEmitter>(e) ||
-           std::dynamic_pointer_cast<intel_cpu::BrgemmCopyBEmitter>(e);
+           std::dynamic_pointer_cast<intel_cpu::BrgemmCopyBEmitter>(e) ||
+           std::dynamic_pointer_cast<intel_cpu::jit_perf_count_chrono_start_emitter>(e) ||
+           std::dynamic_pointer_cast<intel_cpu::jit_perf_count_chrono_end_emitter>(e) ||
+           std::dynamic_pointer_cast<intel_cpu::jit_perf_count_rdtsc_start_emitter>(e) ||
+           std::dynamic_pointer_cast<intel_cpu::jit_perf_count_rdtsc_end_emitter>(e);
 }
 } // namespace ov
@ -213,5 +213,73 @@ void jit_emitter::emit_code(const std::vector<size_t> &in_idxs, const std::vecto
    emitter_postamble();
}

void jit_emitter::internal_call_preamble() const {
    // gprs
    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
                                     h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);

    h->sub(h->rsp, n_gprs_to_save * gpr_size);
    for (size_t i = 0; i < n_gprs_to_save; ++i)
        h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);

    // mask regs
    // Preserve based on CPU capability rather than the host ISA:
    // emitters with different ISAs may coexist in one subgraph KernelEmitter in the future for performance reasons.
    // E.g. the other emitters' ISA is avx512 while this emitter's ISA is avx2 and an internal call is used;
    // the internal call may use avx512 and clobber the k-regs.
    // Platforms with avx512_common but without avx512_core (Knights Landing) are obsolete and not handled.
    if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
        h->sub(h->rsp, k_mask_num * k_mask_size);
        for (size_t i = 0; i < k_mask_num; ++i) {
            h->kmovq(h->ptr[h->rsp + i * k_mask_size], Xbyak::Opmask(static_cast<int>(i)));
        }
    }

    // vector regs
    // 1. Caller obligation to save vector registers as callee may use them.
    // 2. There is an implicit assumption that the host code uses the same
    //    `isa` as the injector. Once the assumption is wrong, `vecs_count` and
    //    `vlen` should be replaced with `host_isa::vlen` and
    //    `host_isa::vecs_count`.
    h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
    for (size_t i = 0; i < get_max_vecs_count(); ++i) {
        push_vec(h->ptr[h->rsp + i * get_vec_length()], i);
    }
}

void jit_emitter::internal_call_postamble() const {
    // restore vector registers
    for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
        pop_vec(static_cast<size_t>(i), h->ptr[h->rsp + i * get_vec_length()]);
    }
    h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());

    // restore k regs
    if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
        for (int i = k_mask_num - 1; i >= 0; --i) {
            h->kmovq(Xbyak::Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
        }
        h->add(h->rsp, k_mask_num * k_mask_size);
    }

    // restore gpr registers
    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
                                     h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
    for (int i = n_gprs_to_save - 1; i >= 0; --i)
        h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
    h->add(h->rsp, n_gprs_to_save * gpr_size);
}

void jit_emitter::internal_call_rsp_align() const {
    h->mov(h->rbx, h->rsp);
    h->and_(h->rbx, 0xf);
    h->sub(h->rsp, h->rbx);
}

void jit_emitter::internal_call_rsp_restore() const {
    h->add(h->rsp, h->rbx);
}

} // namespace intel_cpu
} // namespace ov
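Taken together, these helpers give emitters a standard recipe for calling back into C++ from JIT code; the perf count emitters below follow it exactly. Schematically (callback_fn and arg_ptr are placeholders for a void(*)(T*) function and its argument):

    internal_call_preamble();                                // save gprs, k-regs, vector regs
    h->mov(h->rax, reinterpret_cast<size_t>(callback_fn));   // function address
    h->mov(abi_param1, reinterpret_cast<size_t>(arg_ptr));   // first ABI argument
    internal_call_rsp_align();                               // 16-byte stack alignment (uses rbx)
    h->call(h->rax);
    internal_call_rsp_restore();                             // undo the alignment adjustment
    internal_call_postamble();                               // restore all saved registers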
@ -106,6 +106,8 @@ protected:
 mutable std::vector<size_t> aux_gpr_idxs;

 static constexpr int k_mask_size = 8;
 static constexpr int k_mask_num = 8;
 static constexpr int gpr_size = 8;

 Xbyak::Address table_val(std::string key, size_t key_off_val_shift = 0) const {
     auto off = table_off(key, key_off_val_shift);
@ -130,6 +132,13 @@ protected:
     }
 }

+void internal_call_preamble() const;
+void internal_call_postamble() const;
+// align the stack on a 16-byte boundary, as the ABI requires
+// rbx holds the alignment adjustment, so it must not be changed between internal_call_rsp_align()
+// and internal_call_rsp_restore(); the callee is responsible for saving and restoring rbx.
+void internal_call_rsp_align() const;
+void internal_call_rsp_restore() const;

 private:
 mutable std::vector<size_t> preserved_vec_idxs;
 mutable std::vector<size_t> preserved_gpr_idxs;
@ -0,0 +1,73 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "jit_emitter.hpp"
#include "jit_perf_count_chrono_emitters.hpp"
#include <cpu/x64/jit_generator.hpp>

using namespace dnnl::impl;
using namespace dnnl::impl::utils;
using namespace dnnl::impl::cpu;
using namespace dnnl::impl::cpu::x64;
using namespace Xbyak;
using namespace Xbyak::util;

namespace ov {
namespace intel_cpu {

jit_perf_count_chrono_start_emitter::jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                                                         const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
    m_start_node = ov::as_type_ptr<snippets::op::PerfCountBegin>(n);
}

size_t jit_perf_count_chrono_start_emitter::get_inputs_num() const {
    return 0;
}

void jit_perf_count_chrono_start_emitter::set_start_time(snippets::op::PerfCountBegin* start_node) {
    start_node->set_start_time();
}

void jit_perf_count_chrono_start_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
    internal_call_preamble();

    const auto &set_start_time_overload = static_cast<void (*)(snippets::op::PerfCountBegin*)>(set_start_time);
    h->mov(h->rax, reinterpret_cast<size_t>(set_start_time_overload));
    h->mov(abi_param1, reinterpret_cast<size_t>(m_start_node.get()));
    internal_call_rsp_align();
    h->call(h->rax);
    internal_call_rsp_restore();

    internal_call_postamble();
}

///////////////////jit_perf_count_chrono_end_emitter////////////////////////////////////
jit_perf_count_chrono_end_emitter::jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                                                     const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
    m_end_node = ov::as_type_ptr<snippets::op::PerfCountEnd>(n);
}

size_t jit_perf_count_chrono_end_emitter::get_inputs_num() const {
    return 0;
}

void jit_perf_count_chrono_end_emitter::set_accumulated_time(snippets::op::PerfCountEnd* end_node) {
    end_node->set_accumulated_time();
}

void jit_perf_count_chrono_end_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
    internal_call_preamble();

    const auto &set_accumulated_time_overload = static_cast<void (*)(snippets::op::PerfCountEnd*)>(set_accumulated_time);
    h->mov(h->rax, reinterpret_cast<size_t>(set_accumulated_time_overload));
    h->mov(abi_param1, reinterpret_cast<size_t>(m_end_node.get()));
    internal_call_rsp_align();
    h->call(h->rax);
    internal_call_rsp_restore();

    internal_call_postamble();
}

} // namespace intel_cpu
} // namespace ov
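In effect, the generated code at each marker is functionally equivalent to a direct call on the captured node pointer (a conceptual reading, not the literal emitted instructions):

    // begin marker: set_start_time(m_start_node.get());
    // end marker:   set_accumulated_time(m_end_node.get());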
@ -0,0 +1,40 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "jit_emitter.hpp"
#include <cpu/x64/jit_generator.hpp>

#include "snippets/op/perf_count.hpp"

namespace ov {
namespace intel_cpu {

class jit_perf_count_chrono_start_emitter : public jit_emitter {
public:
    jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                        const std::shared_ptr<ov::Node>& n);
    size_t get_inputs_num() const override;

private:
    void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
    static void set_start_time(snippets::op::PerfCountBegin* start_node);
    std::shared_ptr<snippets::op::PerfCountBegin> m_start_node = nullptr;
};

class jit_perf_count_chrono_end_emitter : public jit_emitter {
public:
    jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                      const std::shared_ptr<ov::Node>& n);
    size_t get_inputs_num() const override;

private:
    void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
    static void set_accumulated_time(snippets::op::PerfCountEnd* end_node);
    std::shared_ptr<snippets::op::PerfCountEnd> m_end_node = nullptr;
};

} // namespace intel_cpu
} // namespace ov
@ -0,0 +1,86 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "jit_emitter.hpp"
#include "jit_perf_count_rdtsc_emitters.hpp"
#include <cpu/x64/jit_generator.hpp>

using namespace dnnl::impl;
using namespace dnnl::impl::utils;
using namespace dnnl::impl::cpu;
using namespace dnnl::impl::cpu::x64;
using namespace Xbyak;
using namespace Xbyak::util;

namespace ov {
namespace intel_cpu {

jit_perf_count_rdtsc_start_emitter::jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                                                       const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
    m_start_node = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscBegin>(n);
}

size_t jit_perf_count_rdtsc_start_emitter::get_inputs_num() const {
    return 0;
}

void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
    h->push(h->rax);
    h->push(h->rdx);

    // RDTSC loads the high-order 32 bits of the time-stamp counter into EDX and the low-order 32 bits into EAX.
    h->lfence();
    h->rdtsc();
    h->lfence();
    h->shl(h->rdx, 0x20);     // shift to the upper half of rdx (0x20 == 32)
    h->or_(h->rdx, h->rax);   // rdx now holds the current tsc

    h->mov(h->rax, reinterpret_cast<size_t>(&m_start_node->start_count));
    h->mov(qword[h->rax], h->rdx);

    h->pop(h->rdx);
    h->pop(h->rax);
}

///////////////////jit_perf_count_rdtsc_end_emitter////////////////////////////////////
jit_perf_count_rdtsc_end_emitter::jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                                                   const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
    m_end_node = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscEnd>(n);
}

size_t jit_perf_count_rdtsc_end_emitter::get_inputs_num() const {
    return 0;
}

void jit_perf_count_rdtsc_end_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
    h->push(h->rax);
    h->push(h->rdx);

    h->lfence();
    h->rdtsc();
    h->lfence();
    h->shl(h->rdx, 0x20);
    h->or_(h->rdx, h->rax);   // rdx now holds the current tsc

    // tsc duration
    h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->get_pc_begin()->start_count));
    h->sub(h->rdx, qword[h->rax]);  // rdx now holds the tsc duration

    // accumulation += tsc duration
    h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->accumulation));
    h->add(h->rdx, qword[h->rax]);
    h->mov(qword[h->rax], h->rdx);

    // iteration++
    h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->iteration));
    h->mov(h->rdx, qword[h->rax]);
    h->add(h->rdx, 0x01);
    h->mov(qword[h->rax], h->rdx);

    h->pop(h->rdx);
    h->pop(h->rax);
}

} // namespace intel_cpu
} // namespace ov
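For reference, the lfence/rdtsc/lfence sequence the emitters generate corresponds to this plain C++ helper (a sketch using standard x86 intrinsics; use <intrin.h> on MSVC):

    #include <cstdint>
    #include <x86intrin.h>  // _mm_lfence, __rdtsc

    // Serialized TSC read: the fences keep the timestamp from being reordered
    // relative to the surrounding instructions, matching the emitted sequence.
    static inline uint64_t read_tsc_serialized() {
        _mm_lfence();
        const uint64_t tsc = __rdtsc();  // EDX:EAX combined into 64 bits
        _mm_lfence();
        return tsc;
    }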
@ -0,0 +1,37 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "jit_emitter.hpp"
#include <cpu/x64/jit_generator.hpp>
#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp"

namespace ov {
namespace intel_cpu {

class jit_perf_count_rdtsc_start_emitter : public jit_emitter {
public:
    jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                       const std::shared_ptr<ov::Node>& n);
    size_t get_inputs_num() const override;

private:
    void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
    std::shared_ptr<ov::intel_cpu::PerfCountRdtscBegin> m_start_node = nullptr;
};

class jit_perf_count_rdtsc_end_emitter : public jit_emitter {
public:
    jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                     const std::shared_ptr<ov::Node>& n);
    size_t get_inputs_num() const override;

private:
    void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
    std::shared_ptr<ov::intel_cpu::PerfCountRdtscEnd> m_end_node = nullptr;
};

} // namespace intel_cpu
} // namespace ov
@ -26,10 +26,6 @@ using jit_generator = dnnl::impl::cpu::x64::jit_generator;
 using cpu_isa_t = dnnl::impl::cpu::x64::cpu_isa_t;
 using ExpressionPtr = ov::snippets::lowered::ExpressionPtr;

-namespace {
-constexpr size_t gpr_size = 8;
-} // namespace
-
 inline static void transform_idxs_to_regs(const std::vector<size_t>& idxs, std::vector<Reg64>& regs) {
     regs.resize(idxs.size());
     std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx){return Reg64(static_cast<int>(idx));});
@ -1114,32 +1110,7 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, c
     h->add(h->rsp, n_gprs_to_save * gpr_size);
 }

-    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
-                                     h->rax, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx};
-    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
-
-    h->sub(h->rsp, n_gprs_to_save * gpr_size);
-    for (size_t i = 0; i < n_gprs_to_save; ++i)
-        h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);
-
-    // caller obligation to save k-regs as callee may use them
-    size_t n_k_regs_to_save = 8;
-    h->sub(h->rsp, n_k_regs_to_save * k_mask_size);
-    for (size_t i = 0; i < n_k_regs_to_save; ++i) {
-        if (mayiuse(avx512_core))
-            h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
-        else
-            h->kmovw(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
-    }
-
-    // 1. Caller obligation to save vector registers as callee may use them.
-    // 2. There is an implicit assumption that the host code uses the same
-    //    `isa` as the injector. Once the assumption is wrong, `vecs_count` and
-    //    `vlen` should be replaced with `host_isa::vlen` and
-    //    `host_isa::vecs_count`.
-    h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
-    for (size_t i = 0; i < get_max_vecs_count(); ++i)
-        h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Zmm(i));
+    internal_call_preamble();

     // save function address in gpr to pass in call instruction
     const auto& brgemm_kernel_overload = static_cast<void (*)(const brgemm_kernel_t*,
@ -1193,38 +1164,15 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, c
     h->mov(abi_param6, static_cast<int>(m_with_comp));
 #endif

-    // align stack on 16-byte as ABI requires
-    // note that RBX must not be changed by the callee
-    h->mov(h->rbx, h->rsp);
-    h->and_(h->rbx, 0xf);
-    h->sub(h->rsp, h->rbx);
-
+    internal_call_rsp_align();
     h->call(h->rbp);
-
-    h->add(h->rsp, h->rbx);
+    internal_call_rsp_restore();

 #ifdef _WIN32
     h->add(h->rsp, num_args_passed_on_stack * gpr_size);
 #endif
-    // restore vector registers
-    for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
-        h->uni_vmovups(Zmm(i), h->ptr[h->rsp + i * get_vec_length()]);
-    }
-    h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());
-
-    // restore k registers
-    for (int i = n_k_regs_to_save - 1; i >= 0; --i) {
-        if (mayiuse(avx512_core))
-            h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
-        else
-            h->kmovw(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
-    }
-    h->add(h->rsp, n_k_regs_to_save * k_mask_size);
-
-    // restore gpr registers
-    for (int i = n_gprs_to_save - 1; i >= 0; --i)
-        h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
-    h->add(h->rsp, n_gprs_to_save * gpr_size);
+    internal_call_postamble();
 }

 void BrgemmEmitter::kernel_execute(const brgemm_kernel_t *brg_kernel,
@ -1358,32 +1306,7 @@ void BrgemmCopyBEmitter::emit_impl(const std::vector<size_t>& in,

 void BrgemmCopyBEmitter::emit_kernel_call(const matmul::jit_brgemm_matmul_copy_b_t* kernel, Reg64 src, Reg64 dst, Reg64 comp,
                                           size_t N, size_t K, size_t offset_in, size_t offset_out, size_t offset_comp) const {
-    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
-                                     h->rax, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx};
-    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
-
-    h->sub(h->rsp, n_gprs_to_save * gpr_size);
-    for (size_t i = 0; i < n_gprs_to_save; ++i)
-        h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);
-
-    // caller obligation to save k-regs as callee may use them
-    size_t n_k_regs_to_save = 8;
-    h->sub(h->rsp, n_k_regs_to_save * k_mask_size);
-    for (size_t i = 0; i < n_k_regs_to_save; ++i) {
-        if (mayiuse(avx512_core))
-            h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
-        else
-            h->kmovw(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
-    }
-
-    // 1. Caller obligation to save vector registers as callee may use them.
-    // 2. There is an implicit assumption that the host code uses the same
-    //    `isa` as the injector. Once the assumption is wrong, `vecs_count` and
-    //    `vlen` should be replaced with `host_isa::vlen` and
-    //    `host_isa::vecs_count`.
-    h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
-    for (size_t i = 0; i < get_max_vecs_count(); ++i)
-        h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Zmm(i));
+    internal_call_preamble();

     const auto data_ptr = [&](Xmm xmm, Xbyak::Reg64 reg, size_t bytes_offset) {
         h->uni_vmovq(reg, xmm);
@ -1437,38 +1360,16 @@ void BrgemmCopyBEmitter::emit_kernel_call(const matmul::jit_brgemm_matmul_copy_b
     h->mov(abi_param5, N);
     h->mov(abi_param6, K);
 #endif
-    // align stack on 16-byte as ABI requires
-    // note that RBX must not be changed by the callee
-    h->mov(h->rbx, h->rsp);
-    h->and_(h->rbx, 0xf);
-    h->sub(h->rsp, h->rbx);
-
+    internal_call_rsp_align();
     h->call(h->rbp);
-
-    h->add(h->rsp, h->rbx);
+    internal_call_rsp_restore();

 #ifdef _WIN32
     h->add(h->rsp, gpr_size * num_args_passed_on_stack);
 #endif
-    // restore vector registers
-    for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
-        h->uni_vmovups(Zmm(i), h->ptr[h->rsp + i * get_vec_length()]);
-    }
-    h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());
-
-    // restore k registers
-    for (int i = n_k_regs_to_save - 1; i >= 0; --i) {
-        if (mayiuse(avx512_core))
-            h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
-        else
-            h->kmovw(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
-    }
-    h->add(h->rsp, n_k_regs_to_save * k_mask_size);
-
-    // restore gpr registers
-    for (int i = n_gprs_to_save - 1; i >= 0; --i)
-        h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
-    h->add(h->rsp, n_gprs_to_save * gpr_size);
+    internal_call_postamble();
 }

 void BrgemmCopyBEmitter::execute(matmul::jit_brgemm_matmul_copy_b_t *kernel, const void *src,
@ -15,6 +15,7 @@
 #include "transformations/snippets/x64/op/store_convert.hpp"
 #include "transformations/snippets/x64/op/brgemm_cpu.hpp"
 #include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
+#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp"

 #include <ov_ops/augru_cell.hpp>
 #include <ov_ops/augru_sequence.hpp>
@ -159,12 +160,16 @@ std::map<std::string, ngraph::OpSet> Extension::getOpSets() {
 NGRAPH_OP(Subgraph, ov::snippets::op)
 NGRAPH_OP(VectorBuffer, ov::snippets::op)
 NGRAPH_OP(RankNormalization, ov::snippets::op)
+NGRAPH_OP(PerfCountBegin, ov::snippets::op)
+NGRAPH_OP(PerfCountEnd, ov::snippets::op)
 NGRAPH_OP_X64(LoadConvertSaturation, ov::intel_cpu)
 NGRAPH_OP_X64(LoadConvertTruncation, ov::intel_cpu)
 NGRAPH_OP_X64(StoreConvertSaturation, ov::intel_cpu)
 NGRAPH_OP_X64(StoreConvertTruncation, ov::intel_cpu)
 NGRAPH_OP_X64(BrgemmCPU, ov::intel_cpu)
 NGRAPH_OP_X64(BrgemmCopyB, ov::intel_cpu)
+NGRAPH_OP_X64(PerfCountRdtscBegin, ov::intel_cpu)
+NGRAPH_OP_X64(PerfCountRdtscEnd, ov::intel_cpu)
 #undef NGRAPH_OP

 return opset;
@ -0,0 +1,32 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "perf_count_rdtsc.hpp"

using namespace ov;
using namespace ov::intel_cpu;

/////////////////////////PerfCountRdtscBegin//////////////////////
PerfCountRdtscBegin::PerfCountRdtscBegin() : PerfCountBeginBase() {
    validate_and_infer_types_except_PerfCountEnd();
}

std::shared_ptr<Node> PerfCountRdtscBegin::clone_with_new_inputs(const OutputVector& inputs) const {
    return std::make_shared<PerfCountRdtscBegin>();
}

/////////////////////////PerfCountRdtscEnd//////////////////////
PerfCountRdtscEnd::PerfCountRdtscEnd(const Output<Node>& pc_begin) : ov::snippets::op::PerfCountEndBase({pc_begin}), accumulation(0ul), iteration(0u) {
    constructor_validate_and_infer_types();
}

std::shared_ptr<Node> PerfCountRdtscEnd::clone_with_new_inputs(const OutputVector& inputs) const {
    return std::make_shared<PerfCountRdtscEnd>(inputs.at(0));
}

std::shared_ptr<PerfCountRdtscBegin> PerfCountRdtscEnd::get_pc_begin() {
    const auto& pc_begin = ov::as_type_ptr<PerfCountRdtscBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
    OPENVINO_ASSERT(pc_begin != nullptr, "PerfCountRdtscEnd last input is not connected to PerfCountRdtscBegin");
    return pc_begin;
}
@ -0,0 +1,55 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"
#include "snippets/op/perf_count.hpp"

using namespace ov::snippets::op;

namespace ov {
namespace intel_cpu {

/**
 * @interface PerfCountRdtscBegin
 * @brief Performance count start time via reading the RDTSC register
 * @ingroup snippets
 */
class PerfCountRdtscBegin : public PerfCountBeginBase {
public:
    OPENVINO_OP("PerfCountRdtscBegin", "SnippetsOpset", PerfCountBeginBase);
    PerfCountRdtscBegin();
    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;

    uint64_t start_count = 0ul;
};

/**
 * @interface PerfCountRdtscEnd
 * @brief Performance count end time and duration
 * @ingroup snippets
 */
class PerfCountRdtscEnd : public PerfCountEndBase {
public:
    OPENVINO_OP("PerfCountRdtscEnd", "SnippetsOpset", PerfCountEndBase);
    PerfCountRdtscEnd(const Output<Node>& pc_begin);
    PerfCountRdtscEnd() = default;
    ~PerfCountRdtscEnd() {
        uint64_t avg = iteration == 0 ? 0 : accumulation / iteration;
        std::cout << "accumulation:" << accumulation << " iteration:" << iteration << " avg:" << avg << std::endl;
    }
    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;

    std::shared_ptr<PerfCountRdtscBegin> get_pc_begin();
    // On each call, PerfCountRdtscBegin records start_count.
    // On each call, PerfCountRdtscEnd reads end_count, then accumulation += end_count - start_count and iteration++.
    // The destructor of PerfCountRdtscEnd outputs the perf info.
    // accumulation is a cycle count
    uint64_t accumulation = 0ul;
    uint32_t iteration = 0u;
};

} // namespace intel_cpu
} // namespace ov
@ -9,6 +9,7 @@
 #include "op/fused_mul_add.hpp"
 #include "op/load_convert.hpp"
 #include "op/store_convert.hpp"
+#include "op/perf_count_rdtsc.hpp"
 #include "transformations/cpu_opset/common/op/swish_cpu.hpp"

 namespace ov {
@ -38,6 +39,8 @@ const CPUShapeInferSnippetsFactory::TRegistry CPUShapeInferSnippetsFactory::spec
 SHAPE_INFER_PREDEFINED(ov::intel_cpu::LoadConvertTruncation, PassThroughShapeInfer),
 SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertSaturation, PassThroughShapeInfer),
 SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertTruncation, PassThroughShapeInfer),
+SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscBegin, EmptyShapeInfer),
+SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscEnd, EmptyShapeInfer),
 SHAPE_INFER_OP_SPECIFIC_EXTERNAL(ov::intel_cpu::BrgemmCPU, BrgemmShapeInfer),
 //
 SHAPE_INFER_OP_SPECIFIC(ov::intel_cpu::BrgemmCopyB),