[CPU][ARM] Perf fixes for FP16 precision (#18973)
parent 4e96b6ba9d
commit f09d2e2666
@@ -19,6 +19,7 @@
 #include "graph_optimizer.h"
 #include "dnnl_extension_utils.h"
 #include "extension_mngr.h"
+#include "ie_ngraph_utils.hpp"
 #include "memory_solver.hpp"
 #include "itt.h"
 #include "infer_request.h"
@@ -198,8 +199,8 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model> &subgraph) {
 void Graph::Replicate(const CNNNetwork &network) {
     OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::intel_cpu_LT, "Graph::Replicate", "CNNNetwork");

-    InputsDataMap inputsInfo = network.getInputsInfo();
-    OutputsDataMap outputsInfo = network.getOutputsInfo();
+    const InputsDataMap& inputsInfo = network.getInputsInfo();
+    const OutputsDataMap& outputsInfo = network.getOutputsInfo();

     this->_name = network.getName();

@@ -290,8 +291,6 @@ void Graph::Replicate(const CNNNetwork &network) {
         graphNodes.push_back(outNode);
     }

-    EnforceInferencePrecision();
-
     auto hasSubgraphConsumers = [] (const NodePtr& node) -> bool {
         const auto & childEdges = node->getChildEdges();
         return std::any_of(childEdges.begin(), childEdges.end(),
@@ -302,13 +301,23 @@ void Graph::Replicate(const CNNNetwork &network) {
                 return edgePtr->getChild()->getType() == Type::Subgraph;
             });
     };

     // change precision for input/output nodes to avoid extra data conversion when set input/output blobs
-    // also we need to change input/output precisions for consumers/producers to avoid inserting reorder
     for (auto &input : inputNodesMap) {
         const auto precToSet = normalizeToSupportedPrecision(inputsInfo.at(input.first)->getPrecision());
         input.second->setOriginalOutputPrecisionAtPort(0, precToSet);
-        const auto childEdges = input.second->getChildEdgesAtPort(0);
+    }
+
+    for (auto &output : outputNodesMap) {
+        const auto precToSet = normalizeToSupportedPrecision(outputsInfo.at(output.first)->getPrecision());
+        output.second->setOriginalInputPrecisionAtPort(0, precToSet);
+    }
+    // enforce must be performed after inputs and outputs info are taken into account
+    EnforceInferencePrecision();
+    // also we need to change input/output precisions for consumers/producers to avoid inserting reorder
+    for (auto &input : inputNodesMap) {
+        const auto& inputNode = input.second;
+        const auto precToSet = inputNode->getOriginalOutputPrecisionAtPort(0);
+        const auto childEdges = inputNode->getChildEdgesAtPort(0);
         for (size_t i = 0; i < childEdges.size(); i++) {
             const auto child = childEdges[i]->getChild();
             const auto child_prec = child->getOriginalInputPrecisionAtPort(childEdges[i]->getOutputNum());
@@ -320,9 +329,9 @@ void Graph::Replicate(const CNNNetwork &network) {
     }

     for (auto &output : outputNodesMap) {
-        const auto precToSet = normalizeToSupportedPrecision(outputsInfo.at(output.first)->getPrecision());
-        output.second->setOriginalInputPrecisionAtPort(0, precToSet);
-        const auto parentEdges = output.second->getParentEdgesAtPort(0);
+        const auto& outputNode = output.second;
+        const auto precToSet = outputNode->getOriginalInputPrecisionAtPort(0);
+        const auto parentEdges = outputNode->getParentEdgesAtPort(0);
         for (size_t i = 0; i < parentEdges.size(); i++) {
             const auto parent = parentEdges[i]->getParent();
             parent->setOriginalOutputPrecisionAtPort(parentEdges[i]->getInputNum(), precToSet);
@@ -337,7 +346,7 @@ void Graph::Replicate(const CNNNetwork &network) {
         } else {
             outShape = inputNodesMap[input.first]->outputShapes.front();
         }
-        InputInfo::Ptr ii = inputsInfo[input.first];
+        InputInfo::Ptr ii = input.second;
         if (ii && ii->getPreProcess().getNumberOfChannels()) {
             _normalizePreprocMap[input.first].Load(outShape, ii);
         }
@@ -1685,21 +1694,14 @@ bool Graph::InsertNode(NodePtr parent, NodePtr child, NodePtr node, int parentPo
     return true;
 }

-// Set all non const data paths precision to BF16
+// Apply inference precision configuration
 void Graph::EnforceInferencePrecision() {
     CPU_DEBUG_CAP_ENABLE(static EnforceInferPrcDebug inferPrecDebug);
-    auto inferPrec = InferenceEngine::Precision::FP32;
-    switch (getConfig().inferencePrecision) {
-    case ov::element::bf16:
-        inferPrec = InferenceEngine::Precision::BF16;
-        break;
-    case ov::element::f16:
-        inferPrec = InferenceEngine::Precision::FP16;
-        break;
-    default:
-        return;
-        break;
-    }
+
+    const auto inferPrec = convertPrecision(getConfig().inferencePrecision);
+
+    if (inferPrec == Precision::FP32)
+        return; // nothing to do, only precision reduction is currently allowed

     std::function<void(const NodePtr&, std::unordered_set<NodePtr>& skipNodes)> searchForNodesToSkip;
     searchForNodesToSkip = [&](const NodePtr& node, std::unordered_set<NodePtr>& skipNodes) -> void {
@@ -1743,46 +1745,62 @@ void Graph::EnforceInferencePrecision() {
     std::unordered_set<NodePtr> nodesToSkip;
     // starting from output nodes
     for (const auto& entry : outputNodesMap) {
-        const auto& node = entry.second;
-        if (node->getOriginalInputPrecisionAtPort(0) == Precision::BF16)
+        const auto& output = entry.second;
+        // do not skip outputs which precisions are explicitly set equal to inferPrec
+        if (output->getOriginalInputPrecisionAtPort(0) == inferPrec)
             continue;
-        searchForNodesToSkip(node, nodesToSkip);
+
+        searchForNodesToSkip(output, nodesToSkip);
     }

     for (const auto& node : graphNodes) {
         if (nodesToSkip.count(node) && !node->enforceBF16evenForGraphTail)
             continue;

-        if (node->getType() != Type::Input && node->getType() != Type::Output) {
+        if (one_of(node->getType(), Type::Input, Type::Output))
+            continue;

 #ifdef CPU_DEBUG_CAPS
         if (!inferPrecDebug.enabled(NameFromType(node->getType()), node->getName()))
             continue;
 #endif

-        DEBUG_LOG("#", node->getExecIndex(),
-                  " ", node->getName(),
-                  " is enforced to use", inferPrec);
+        DEBUG_LOG("#", node->getExecIndex(), " ", node->getName(), " is enforced to use", inferPrec);

         for (size_t i = 0; i < node->getOriginalInputsNumber(); i++) {
-            const auto &parent = node->getParentEdgesAtPort(i)[0]->getParent();
+            auto keepOriginalInputPrecisionAtPort = [](const NodePtr& node, const size_t inPort) {
+                // keep non-float precisions
+                if (node->getOriginalInputPrecisionAtPort(inPort) != Precision::FP32)
+                    return true;
+
+                const auto &parent = node->getParentEdgesAtPort(inPort)[0]->getParent();
                 /* Skip BF16 enforcement for nodes after Constant Inputs for maintaining precision for fusing.
-                 * Precision conversion to BF16 does automatically, if convolution follows up after Constant Inputs
-                 * and if activation is BF16 */
-                if (!(parent->getType() == Type::Input && parent->isConstant() &&
+                 * Precision conversion to BF16 is done automatically, if convolution follows up after Constant Inputs
+                 * and activation is BF16 */
+                if (parent->getType() == Type::Input && parent->isConstant() &&
                     // Concatenation node is exception because it doesn't change an accuracy for BF16 activation
-                      node->getType() != Type::Concatenation) &&
-                    // exclude Eltwise after Input since it supports conversion to BF16
-                    !(parent->getType() == Type::Input && (node->getType() == Type::Eltwise || node->getType() == Type::Subgraph)) &&
-                    node->getOriginalInputPrecisionAtPort(i) == Precision::FP32)
+                    node->getType() != Type::Concatenation)
+                    return true;
+                // Eltwise and Subgraph (snippets) nodes support precision conversion
+                if (parent->getType() == Type::Input && one_of(node->getType(), Type::Eltwise, Type::Subgraph))
+                    return true;
+
+                return false;
+            };
+
+            if (keepOriginalInputPrecisionAtPort(node, i))
+                continue;
+
             node->setOriginalInputPrecisionAtPort(i, inferPrec);
         }

         for (size_t i = 0; i < node->getOriginalOutputsNumber(); i++) {
-            if (node->getOriginalOutputPrecisionAtPort(i) == Precision::FP32)
+            // keep non-float precisions
+            if (node->getOriginalOutputPrecisionAtPort(i) != Precision::FP32)
                 continue;

             node->setOriginalOutputPrecisionAtPort(i, inferPrec);
         }
-        }
     }
 }

 std::shared_ptr<ov::Model> Graph::dump() const {
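Note on the hunk above: the new keepOriginalInputPrecisionAtPort lambda replaces one negated compound condition with early returns, one per exception case. A minimal standalone sketch of the same transformation (the boolean names are illustrative, not the plugin's API):

    // Before: enforce unless !(constInput && !concat) && !(input && convertsItself) && isFP32
    // After: every "keep the original precision" exception gets its own early return.
    static bool keepOriginalPrecision(bool isFP32, bool constInput, bool isConcat,
                                      bool isInput, bool convertsItself) {
        if (!isFP32)
            return true;   // non-float precisions are never enforced
        if (constInput && !isConcat)
            return true;   // constant-input fusing path converts implicitly
        if (isInput && convertsItself)
            return true;   // Eltwise/Subgraph handle the conversion themselves
        return false;
    }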
@@ -5,6 +5,7 @@
 #include "eltwise.h"

+#include <common/float16.hpp>
 #include <map>
 #include <set>

@@ -1537,6 +1538,13 @@ public:
     static const int optimalTensorRank = 6;
 };

+/* enabled only for float and float16_t at the moment
+ * can be extended in the future */
+template<typename T,
+         typename std::enable_if<
+             std::is_same<T, float>::value ||
+             std::is_same<T, dnnl::impl::float16_t>::value>
+         ::type* = nullptr>
 class EltwiseRefExecutor : public Eltwise::IEltwiseExecutor {
 public:
     EltwiseRefExecutor(Eltwise::EltwiseData opData,
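The std::enable_if guard above limits EltwiseRefExecutor instantiation to float and dnnl::impl::float16_t. The same compile-time gating technique in isolation (a sketch with stand-in types, not the plugin's code):

    #include <type_traits>

    // Only float and double may instantiate this template; any other T fails to
    // compile because enable_if<...>::type does not exist for it (SFINAE).
    template <typename T,
              typename std::enable_if<std::is_same<T, float>::value ||
                                      std::is_same<T, double>::value>::type* = nullptr>
    T twice(T value) {
        return value + value;
    }

    // twice(1.5f);  // OK, T = float
    // twice(2.0);   // OK, T = double
    // twice(1);     // compile error: int is rejected at the signature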
@@ -1571,30 +1579,30 @@ public:
         _dst_offsets.resize(input_size, 1);
         EltwiseJitExecutor::offset_out_calc(_dst_offsets, _dims);
         for (size_t j = 0; j < input_size; j++) {
-            _dst_offsets[j] *= sizeof(float); // only FP32 out prc is supported
+            _dst_offsets[j] *= sizeof(T);
         }

         for (size_t i = 0; i < _inputNum; i++) {
             _src_offsets[i].resize(input_size, 1);
             EltwiseJitExecutor::offset_in_calc(_src_offsets[i], inpDims[i], _dims);
             for (size_t j = 0; j < input_size; j++) {
-                _src_offsets[i][j] *= sizeof(float); // only FP32 inp prcs are supported
+                _src_offsets[i][j] *= sizeof(T);
             }
         }
     }

     void exec(const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) override {
         if (_opData.algo == Algorithm::EltwiseLog) {
-            const float* src_ptr_f = reinterpret_cast<const float*>(args_ptrs.src_ptr[0]);
-            float* dst_ptr_f = reinterpret_cast<float*>(args_ptrs.dst_ptr);
+            const T* src_ptr_f = reinterpret_cast<const T*>(args_ptrs.src_ptr[0]);
+            T* dst_ptr_f = reinterpret_cast<T*>(args_ptrs.dst_ptr);
             parallel_for(_fullWorkAmount, [&](size_t i) {
                 dst_ptr_f[i] = logf(src_ptr_f[i]);
             });
             return;
         }
         if (_opData.algo == Algorithm::EltwisePowerStatic) {
-            const float* src_ptr_f = reinterpret_cast<const float*>(args_ptrs.src_ptr[0]);
-            float* dst_ptr_f = reinterpret_cast<float*>(args_ptrs.dst_ptr);
+            const T* src_ptr_f = reinterpret_cast<const T*>(args_ptrs.src_ptr[0]);
+            T* dst_ptr_f = reinterpret_cast<T*>(args_ptrs.dst_ptr);
             if (_opData.alpha == 2) {
                 parallel_for(_fullWorkAmount, [&](size_t i) {
                     dst_ptr_f[i] = (_opData.beta * src_ptr_f[i] + _opData.gamma) *

@@ -1608,9 +1616,9 @@ public:
             return;
         }
         if (_opData.algo == Algorithm::EltwisePowerDynamic) {
-            const float* src_ptr_f = reinterpret_cast<const float*>(args_ptrs.src_ptr[0]);
-            const float* src_ptr_f_pow = reinterpret_cast<const float*>(args_ptrs.src_ptr[1]);
-            float* dst_ptr_f = reinterpret_cast<float*>(args_ptrs.dst_ptr);
+            const T* src_ptr_f = reinterpret_cast<const T*>(args_ptrs.src_ptr[0]);
+            const T* src_ptr_f_pow = reinterpret_cast<const T*>(args_ptrs.src_ptr[1]);
+            T* dst_ptr_f = reinterpret_cast<T*>(args_ptrs.dst_ptr);

             uint32_t count_of_power_values = 1;
             for (unsigned long i : _inpDims[1]) {
@@ -1656,20 +1664,20 @@ public:
                 for (size_t j = 0; j < counters.size(); j++) {
                     index_in[i] += counters[j] * _src_offsets[i][j];
                 }
-                index_in[i] /= sizeof(float);
+                index_in[i] /= sizeof(T);
             }

             size_t index_out = 0;
             for (size_t j = 0; j < counters.size(); j++) {
                 index_out += counters[j] * _dst_offsets[j];
             }
-            index_out /= sizeof(float);
+            index_out /= sizeof(T);

-            std::vector<float> src_f(_inputNum);
+            std::vector<T> src_f(_inputNum);
             for (size_t i = 0; i < _inputNum; i++) {
-                src_f[i] = (reinterpret_cast<const float*>(args_ptrs.src_ptr[i]) + index_in[i])[0];
+                src_f[i] = (reinterpret_cast<const T*>(args_ptrs.src_ptr[i]) + index_in[i])[0];
             }
-            float* dst_ptr_f = reinterpret_cast<float*>(args_ptrs.dst_ptr) + index_out;
+            T* dst_ptr_f = reinterpret_cast<T*>(args_ptrs.dst_ptr) + index_out;

             switch (_opData.algo) {
                 case Algorithm::EltwiseRelu:

@@ -1712,13 +1720,14 @@ public:
                 case Algorithm::EltwiseLogicalOr: *dst_ptr_f = src_f[0] || src_f[1]; break;
                 case Algorithm::EltwiseLogicalXor: *dst_ptr_f = (src_f[0] || src_f[1]) - (src_f[0] && src_f[1]); break;
                 case Algorithm::EltwiseLogicalNot: *dst_ptr_f = !src_f[0]; break;
-                case Algorithm::EltwisePrelu: *dst_ptr_f = src_f[0] > 0 ? src_f[0] : src_f[0] * src_f[1]; break;
+                case Algorithm::EltwisePrelu: *dst_ptr_f = src_f[0] > 0 ? src_f[0] : static_cast<T>(src_f[0] * src_f[1]); break;
                 case Algorithm::EltwiseErf: *dst_ptr_f = std::erf(src_f[0]); break;
                 case Algorithm::EltwiseSoftSign: *dst_ptr_f = src_f[0] / (1 + std::fabs(src_f[0])); break;
-                case Algorithm::EltwiseIsFinite: *dst_ptr_f = std::isfinite(src_f[0]); break;
+                // @todo implement proper isinfinite for non-float precisions
+                case Algorithm::EltwiseIsFinite: *dst_ptr_f = std::isfinite(static_cast<float>(src_f[0])); break;
                 case Algorithm::EltwiseIsInf:
-                    *dst_ptr_f = (_opData.alpha && (src_f[0] == -std::numeric_limits<float>::infinity())) ||
-                                 (_opData.beta && (src_f[0] == std::numeric_limits<float>::infinity()));
+                    *dst_ptr_f = (_opData.alpha && (src_f[0] == -std::numeric_limits<T>::infinity())) ||
+                                 (_opData.beta && (src_f[0] == std::numeric_limits<T>::infinity()));
                     break;
                 case Algorithm::EltwiseIsNaN: *dst_ptr_f = std::isnan(src_f[0]); break;
                 case Algorithm::EltwiseSelect: *dst_ptr_f = src_f[0] ? src_f[1] : src_f[2]; break;
@@ -1757,10 +1766,24 @@ bool Eltwise::EltwiseData::operator==(const EltwiseData &rhs) const noexcept {
            gamma == rhs.gamma;
 }

+static Eltwise::executorPtr buildRefExecutor(const EltwiseKey& key) {
+    if (key.outPrc == Precision::FP16) {
+        return std::make_shared<EltwiseRefExecutor<dnnl::impl::float16_t>>(key.eltwise_data.front(),
+                                                                           key.outBlkDims,
+                                                                           key.inpDims);
+    }
+    // use float reference executor for any other precision for now
+    return std::make_shared<EltwiseRefExecutor<float>>(key.eltwise_data.front(),
+                                                       key.outBlkDims,
+                                                       key.inpDims);
+}
+
 static Eltwise::executorPtr buildExecutor(const EltwiseKey& key) {
-    Eltwise::executorPtr execPtr;
-    if (key.implType != EltwiseImplType::reference) {
-        execPtr = std::make_shared<EltwiseJitExecutor>(key.eltwise_data,
+    if (key.implType == EltwiseImplType::reference) {
+        return buildRefExecutor(key);
+    }
+
+    return std::make_shared<EltwiseJitExecutor>(key.eltwise_data,
                                                 key.ops_list,
                                                 key.outBlkDims,
                                                 key.outOrder,

@@ -1769,12 +1792,6 @@ static Eltwise::executorPtr buildExecutor(const EltwiseKey& key) {
                                                 key.outPrc,
                                                 key.postOps,
                                                 key.implType == EltwiseImplType::optimizedShapeAgnostic);
-    } else {
-        execPtr = std::make_shared<EltwiseRefExecutor>(key.eltwise_data.front(),
-                                                       key.outBlkDims,
-                                                       key.inpDims);
-    }
-    return execPtr;
 }

 bool Eltwise::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
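buildRefExecutor above turns the runtime output precision into a compile-time template argument. The dispatch pattern reduced to its core (stand-in names, assuming only an abstract executor interface):

    #include <memory>

    // A runtime tag picks the instantiation; adding a precision later means
    // adding one branch here instead of a parallel executor class hierarchy.
    struct IExec { virtual ~IExec() = default; };
    template <typename T> struct RefExec : IExec { /* element type T baked in */ };

    std::shared_ptr<IExec> makeRefExec(bool isFP16) {
        if (isFP16)
            return std::make_shared<RefExec<unsigned short>>();  // stand-in for dnnl float16_t
        return std::make_shared<RefExec<float>>();               // float fallback for the rest
    }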
@@ -1965,8 +1982,9 @@ void Eltwise::initSupportedPrimitiveDescriptors() {
     }

 #if defined(OV_CPU_WITH_ACL)
-    Precision forcedPrec;
-    //ACL implementation supports only identical precisions on inputs/outputs so they are aligned it to highest one
+    // Use original output precision as a reference point since some eltwise algorithms have non-float inputs (i.e. EltwiseSelect)
+    Precision forcedPrec = getOriginalOutputPrecisionAtPort(0) == Precision::FP16 ? Precision::FP16 : Precision::FP32;
+    // ACL implementation supports only identical precisions on inputs/outputs, so they are aligned to the highest one
     if (AclEltwiseExecutor::isEltwiseAlgorithmSupported(getAlgorithm())) {
         for (size_t i = 0; i < getParentEdges().size(); i++) {
             if (!getParentEdgeAt(i)->getParent()->isConstant()) {

@@ -1978,9 +1996,8 @@ void Eltwise::initSupportedPrimitiveDescriptors() {
         if (!forcedPrec.is_float()) {
             forcedPrec = Precision::FP32;
         }
-    } else {
-        forcedPrec = Precision::FP32;
     }

     for (size_t i = 0; i < inputPrecisions.size(); i++) {
         inputPrecisions[i] = forcedPrec;
     }
@@ -217,12 +217,14 @@ void FullyConnected::getSupportedDescriptors() {
             outputDataType = memory::data_type::bf16;
         }
     } else if (inputDataType == memory::data_type::f16) {
-#if defined(OV_CPU_WITH_ACL)
-        // acl fc does not support precisions conversion
-        outputDataType = weightsDataType = memory::data_type::f16;
-#else
+        // f16 input only supports f16/f32 output, even if FQ is fused as post-ops
+        if (!one_of(outputDataType, memory::data_type::f32, memory::data_type::f16)) {
+            outputDataType = memory::data_type::f16;
+        }
+#if defined(OV_CPU_WITH_ACL)
+        weightsDataType = memory::data_type::f16;
 #endif
     } else if (one_of(inputDataType, memory::data_type::u8, memory::data_type::s8)) {
         if (weightsDataType != memory::data_type::s8) {
@@ -256,7 +256,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
         {ov::element::i4, ov::element::i8},
         {ov::element::u4, ov::element::u8}
     };

+    // @todo should we always convert to f32 regardless of hardware support, as it is done for f16?
     if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core))
         map.insert({ov::element::bf16, ov::element::f32});
@@ -11,6 +11,7 @@
 #include "ie_common.h"
 #include "ie_layouts.h"
 #include "general_utils.h"
+#include "precision_support.h"

 namespace ov {
 namespace intel_cpu {

@@ -98,10 +99,14 @@ inline bool isEmptyTensorDesc(const InferenceEngine::TensorDesc &td) {
  */
 inline InferenceEngine::Precision normalizeToSupportedPrecision(InferenceEngine::Precision precision) {
     switch (precision) {
+    case InferenceEngine::Precision::BF16:
+    case InferenceEngine::Precision::FP16: {
+        if (!hasHardwareSupport(precision))
+            precision = InferenceEngine::Precision::FP32;
+    }
     case InferenceEngine::Precision::U8:
     case InferenceEngine::Precision::I8:
     case InferenceEngine::Precision::I32:
-    case InferenceEngine::Precision::BF16:
     case InferenceEngine::Precision::FP32: {
         break;
     }

@@ -121,14 +126,11 @@ inline InferenceEngine::Precision normalizeToSupportedPrecision(InferenceEngine:
         precision = InferenceEngine::Precision::I32;
         break;
     }
-    case InferenceEngine::Precision::FP16: {
-        precision = InferenceEngine::Precision::FP32;
-        break;
-    }
     default: {
         precision = InferenceEngine::Precision::UNSPECIFIED;
     }
     }

     return precision;
 }

@@ -161,6 +163,5 @@ inline std::vector<float> makeAlignedBuffer(size_t targetSize, const std::vector
     }
     return alignedBuffer;
 }
-
 } // namespace intel_cpu
 } // namespace ov
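One subtlety in the reshaped switch above: BF16 and FP16 now fall through into the supported-precision cases, so a natively supported half precision is kept while an unsupported one is first downgraded to FP32. A condensed, compilable sketch of that control flow (the enum and the stand-in support check are illustrative):

    enum class Prec { FP32, FP16, BF16, U8, I8, I32, UNSPECIFIED };
    // Stand-in for the hasHardwareSupport() added by this commit;
    // here it pretends no native half-precision support exists.
    inline bool hasHardwareSupport(Prec) { return false; }

    Prec normalize(Prec precision) {
        switch (precision) {
        case Prec::BF16:
        case Prec::FP16:
            if (!hasHardwareSupport(precision))
                precision = Prec::FP32;  // downgrade only without native support
            // deliberate fallthrough: supported half precisions are kept as-is
        case Prec::U8:
        case Prec::I8:
        case Prec::I32:
        case Prec::FP32:
            break;
        default:
            precision = Prec::UNSPECIFIED;
        }
        return precision;
    }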
src/plugins/intel_cpu/src/utils/precision_support.cpp (new file, 42 lines)
@@ -0,0 +1,42 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "precision_support.h"
+
+#include "ie_precision.hpp"
+#include "cpu/x64/cpu_isa_traits.hpp"
+#include "openvino/core/visibility.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+bool hasHardwareSupport(const InferenceEngine::Precision& precision) {
+    switch (precision) {
+    case InferenceEngine::Precision::FP16: {
+#if defined(OPENVINO_ARCH_X86_64)
+        if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_fp16))
+            return true;
+        return false;
+#elif defined(OV_CPU_ARM_ENABLE_FP16)
+        return true; // @todo add runtime check for arm as well
+#else
+        return false;
+#endif
+    }
+    case InferenceEngine::Precision::BF16: {
+#if defined(OPENVINO_ARCH_X86_64)
+        if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core))
+            return true;
+        return false;
+#else
+        return false;
+#endif
+    }
+    default:
+        return true;
+    }
+}
+
+} // namespace intel_cpu
+} // namespace ov
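The @todo in hasHardwareSupport asks for a runtime ARM check instead of the compile-time OV_CPU_ARM_ENABLE_FP16 gate. On Linux/AArch64 one way to do that is the hwcaps auxiliary vector; a hedged sketch, not part of this commit:

    #if defined(__aarch64__) && defined(__linux__)
    #include <sys/auxv.h>
    #include <asm/hwcap.h>

    // HWCAP_FPHP = scalar FP16 arithmetic, HWCAP_ASIMDHP = vector (NEON) FP16;
    // both would be required before advertising native f16 execution.
    static bool armHasNativeFP16() {
        const unsigned long caps = getauxval(AT_HWCAP);
        return (caps & HWCAP_FPHP) && (caps & HWCAP_ASIMDHP);
    }
    #endif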
src/plugins/intel_cpu/src/utils/precision_support.h (new file, 15 lines)
@@ -0,0 +1,15 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_precision.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+bool hasHardwareSupport(const InferenceEngine::Precision& precision);
+
+} // namespace intel_cpu
+} // namespace ov
@@ -88,12 +88,12 @@ std::vector<std::string> disabledTestPatterns() {
         R"(.*Hetero.*Behavior.*ExecutableNetworkBaseTest.*ExecGraphInfo.*)",
         R"(.*Hetero.*Behavior.*OVCompiledModelBaseTest.*ExecGraphInfo.*)",
         R"(.*Hetero.*Behavior.*ExecutableNetworkBaseTest.*CanCreateTwoExeNetworksAndCheckFunction.*)",
-        // TODO: CVS-104942
+        // TODO: 104942
         R"(.*(Auto|Multi).*Behavior.*ExecutableNetworkBaseTest.*canLoadCorrectNetworkToGetExecutableAndCheckConfig.*)",
         R"(.*(Auto|Multi).*SetPropLoadNetWorkGetPropTests.*)",
         R"(.*Hetero.*Behavior.*OVCompiledModelBaseTest.*canCreateTwoCompiledModelAndCheckTheir.*)",
         // CPU does not support dynamic rank
-        // Issue: CVS-66778
+        // Issue: 66778
         R"(.*smoke_BehaviorTests.*InferFullyDynamicNetworkWith(S|G)etTensor.*)",
         R"(.*smoke_Hetero_BehaviorTests.*InferFullyDynamicNetworkWith(S|G)etTensor.*)",
         R"(.*smoke_Auto_BehaviorTests.*InferFullyDynamicNetworkWith(S|G)etTensor.*)",

@@ -111,7 +111,7 @@ std::vector<std::string> disabledTestPatterns() {

         // Issue 67214
         R"(smoke_PrePostProcess.*resize_and_convert_layout_i8.*)",
-        // TODO: CVS-67255
+        // TODO: 67255
         R"(smoke_If.*SimpleIf2OutTest.*)",

         // Issue: 69086

@@ -188,7 +188,7 @@ std::vector<std::string> disabledTestPatterns() {
         // New plugin API doesn't support changes of pre-processing
         R"(.*(Auto|Multi|Hetero).*InferRequestPreprocessTest.*SetPreProcessToInputInfo.*)",
         R"(.*(Auto|Multi|Hetero).*InferRequestPreprocessTest.*SetPreProcessToInferRequest.*)",
-        // TODO: for 22.2 (CVS-68949)
+        // TODO: for 22.2 (Issue 68949)
         R"(.*smoke_AutoBatching_CPU/AutoBatching_Test_DetectionOutput.*)",
     };
@@ -212,7 +212,7 @@ std::vector<std::string> disabledTestPatterns() {
         retVector.emplace_back(R"(smoke_NegativeQuantizedMatMulMultiplyFusion.*)");
         // int8 specific
         retVector.emplace_back(R"(smoke_Quantized.*)");
-        // TODO: fix CVS-115961
+        // TODO: Issue 115961
         retVector.emplace_back(R"(.*compareAutoBatchingToSingleBatch/CPU_get_blob_batch_size_4_num_streams_1_num_req_64*)");
         retVector.emplace_back(R"(.*compareAutoBatchingToSingleBatch/CPU_get_blob_batch_size_4_num_streams_2_num_req_64*)");
         retVector.emplace_back(R"(.*compareAutoBatchingToSingleBatch/CPU_set_blob_batch_size_4_num_streams_1_num_req_64*)");

@@ -230,9 +230,9 @@ std::vector<std::string> disabledTestPatterns() {
         // TODO: generate new 'expected' runtime graph for non-x64 CPU
         retVector.emplace_back(R"(smoke_serialization/ExecGraphSerializationTest.ExecutionGraph.*)");
         retVector.emplace_back(R"(smoke_ExecGraph/ExecGraphRuntimePrecision.CheckRuntimePrecision/Function=(EltwiseWithTwoDynamicInputs|FakeQuantizeRelu).*)");
-        // CVS-108803: bug in CPU scalar implementation
+        // Issue 108803: bug in CPU scalar implementation
         retVector.emplace_back(R"(smoke_TestsDFT_(1|2|3|4)d/DFTLayerTest.CompareWithRefs.*)");
-        // CVS-88764, CVS-91647, CVS-108802: accuracy issue
+        // Issue 88764, 91647, 108802: accuracy issue
         retVector.emplace_back(R"(MultipleLSTMCellTest/MultipleLSTMCellTest.CompareWithRefs.*)");
         // int8 / code-generation specific
         retVector.emplace_back(R"(smoke_LPT.*)");

@@ -258,10 +258,14 @@ std::vector<std::string> disabledTestPatterns() {
         // Skip fp16 tests for paltforms that don't support fp16 precision
         retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)");
     }
-#endif
-#if defined(OV_CPU_ARM_ENABLE_FP16)
+#elif defined(OPENVINO_ARCH_ARM64) || defined(OPENVINO_ARCH_ARM)
+#if !defined(OV_CPU_ARM_ENABLE_FP16)
+    // Skip fp16 tests for platforms that don't support fp16 precision
+    retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)");
+#else
     // Issue 117407
     retVector.emplace_back(R"(.*EltwiseLayerCPUTest.*IS=\(\[1\.\.10\.2\.5\.6\]_\).*eltwiseOpType=SqDiff.*_configItem=INFERENCE_PRECISION_HINT=f16.*)");
+#endif // OV_CPU_ARM_ENABLE_FP16
 #endif
     if (!InferenceEngine::with_cpu_x86_avx512_core_vnni() && !InferenceEngine::with_cpu_x86_avx512_core_amx_int8()) {
         // MatMul in Snippets uses BRGEMM that supports i8 only on platforms with VNNI or AMX instructions
@@ -4,6 +4,8 @@

 #include "eltwise.hpp"
 #include "gtest/gtest.h"
+#include "openvino/core/type/element_type.hpp"
+#include "openvino/runtime/properties.hpp"
 #include "test_utils/cpu_test_utils.hpp"

 using namespace InferenceEngine;

@@ -61,7 +63,7 @@ ov::Tensor EltwiseLayerCPUTest::generate_eltwise_input(const ov::element::Type&
         params = gen_params(INT32_MAX, INT32_MIN);
     }
     return ov::test::utils::create_and_fill_tensor(type, shape, params.range, params.start_from, params.resolution);
-    }
+}

 void EltwiseLayerCPUTest::generate_inputs(const std::vector<ngraph::Shape>& targetInputStaticShapes) {
     inputs.clear();
@@ -81,8 +83,13 @@ void EltwiseLayerCPUTest::SetUp() {
     ElementType netType;
     ngraph::helpers::InputLayerType secondaryInputType;
     ov::test::utils::OpType opType;
-    ov::AnyMap additional_config;
-    std::tie(shapes, eltwiseType, secondaryInputType, opType, netType, inType, outType, targetDevice, additional_config) = basicParamsSet;
+    ov::AnyMap additionalConfig;
+    std::tie(shapes, eltwiseType, secondaryInputType, opType, netType, inType, outType, targetDevice, additionalConfig) = basicParamsSet;
+    // we have to change model precision as well, otherwise inference precision won't affect single-node graph
+    // due to the enforce inference precision optimization for the eltwise as the first node of the model
+    if (ov::element::Type(netType).is_real() && additionalConfig.count(ov::hint::inference_precision.name())) {
+        netType = additionalConfig[ov::hint::inference_precision.name()].as<ov::element::Type>();
+    }

     if (ElementType::bf16 == netType) {
         rel_threshold = 2e-2f;

@@ -93,13 +100,6 @@ void EltwiseLayerCPUTest::SetUp() {
     std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
     std::tie(postOpMgrPtr, fusedOps) = fusingParams;

-    selectedType = makeSelectedTypeStr(getPrimitiveType(), netType);
-#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
-    if (eltwiseType == POWER) {
-        selectedType = std::regex_replace(selectedType, std::regex("acl"), "ref");
-    }
-#endif
-
     shapes.resize(2);
     switch (opType) {
         case ov::test::utils::OpType::SCALAR: {
@@ -117,8 +117,15 @@ void EltwiseLayerCPUTest::SetUp() {
     }

     init_input_shapes(shapes);
+    configuration.insert(additionalConfig.begin(), additionalConfig.end());
+    updateSelectedType(getPrimitiveType(), netType, configuration);
+    // selectedType = makeSelectedTypeStr(getPrimitiveType(), netType);
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+    if (eltwiseType == POWER) {
+        selectedType = std::regex_replace(selectedType, std::regex("acl"), "ref");
+    }
+#endif

-    configuration.insert(additional_config.begin(), additional_config.end());
     auto parameters = ngraph::builder::makeDynamicParams(netType, {inputDynamicShapes.front()});
     std::shared_ptr<ngraph::Node> secondaryInput;
     if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) {
@@ -161,11 +168,11 @@ TEST_P(EltwiseLayerCPUTest, CompareWithRefs) {

 namespace Eltwise {
 const std::vector<ov::AnyMap>& additional_config() {
-    static const std::vector<ov::AnyMap> additional_config = {
+    static const std::vector<ov::AnyMap> additionalConfig = {
         {{ov::hint::inference_precision.name(), ov::element::f32}},
         {{ov::hint::inference_precision.name(), ov::element::f16}}
     };
-    return additional_config;
+    return additionalConfig;
 }

 const std::vector<ElementType>& netType() {
@@ -185,11 +192,11 @@ const std::vector<ngraph::helpers::EltwiseTypes>& eltwiseOpTypesBinInp() {
     static const std::vector<ngraph::helpers::EltwiseTypes> eltwiseOpTypesBinInp = {
         ngraph::helpers::EltwiseTypes::ADD,
         ngraph::helpers::EltwiseTypes::MULTIPLY,
 #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
         ngraph::helpers::EltwiseTypes::SUBTRACT,   // TODO: Fix CVS-105430
         ngraph::helpers::EltwiseTypes::DIVIDE,     // TODO: Fix CVS-105430
         ngraph::helpers::EltwiseTypes::FLOOR_MOD,  // TODO: Fix CVS-111875
 #endif
         ngraph::helpers::EltwiseTypes::SQUARED_DIFF,
     };
     return eltwiseOpTypesBinInp;

@@ -207,9 +214,9 @@ const std::vector<ngraph::helpers::EltwiseTypes>& eltwiseOpTypesBinDyn() {
     static const std::vector<ngraph::helpers::EltwiseTypes> eltwiseOpTypesBinDyn = {
         ngraph::helpers::EltwiseTypes::ADD,
         ngraph::helpers::EltwiseTypes::MULTIPLY,
 #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)  // TODO: Fix CVS-105430
         ngraph::helpers::EltwiseTypes::SUBTRACT,
 #endif
         ngraph::helpers::EltwiseTypes::SQUARED_DIFF,
     };
     return eltwiseOpTypesBinDyn;

@@ -255,10 +262,10 @@ const std::vector<ngraph::helpers::EltwiseTypes>& eltwiseOpTypesI32() {
     static const std::vector<ngraph::helpers::EltwiseTypes> eltwiseOpTypesI32 = {
         ngraph::helpers::EltwiseTypes::ADD,
         ngraph::helpers::EltwiseTypes::MULTIPLY,
 #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)  // TODO: Fix CVS-105430
         ngraph::helpers::EltwiseTypes::SUBTRACT,
         ngraph::helpers::EltwiseTypes::DIVIDE,
 #endif
         ngraph::helpers::EltwiseTypes::SQUARED_DIFF,
     };
     return eltwiseOpTypesI32;
@@ -375,7 +382,7 @@ const std::vector<InputShape>& inShapes_5D_dyn_const() {
                 {3, 2, 4, 11, 2},
             }
         },
     };
     return inShapes_5D_dyn_const;
 }
@@ -114,9 +114,7 @@ void MvnLayerCPUTest::SetUp() {
         rel_threshold = 250.f;
     }
     configuration.insert(additionalConfig.begin(), additionalConfig.end());
-
-    selectedType = getPrimitiveType();
-    selectedType = makeSelectedTypeStr(selectedType, netPrecision);
+    updateSelectedType(getPrimitiveType(), netPrecision, configuration);

     function = makeNgraphFunction(netPrecision, param, mvn, "mvn");
 }
@@ -5,6 +5,7 @@
 #include "reduce.hpp"

+#include "gtest/gtest.h"
 #include "shared_test_classes/base/ov_subgraph.hpp"
 #include "test_utils/cpu_test_utils.hpp"

 using namespace InferenceEngine;
@@ -86,16 +87,10 @@ void ReduceCPULayerTest::SetUp() {
     std::tie(axes, opType, keepDims, reductionType, netPrecision, inPrc, outPrc, inputShapes) = basicParams;
     if (netPrecision == ElementType::boolean) {
         inPrc = outPrc = netPrecision;
-    } else {
-        if (additionalConfig[ov::hint::inference_precision.name()] == ov::element::bf16) {
-            inPrc = outPrc = netPrecision = ElementType::bf16;
-        } else if (additionalConfig[ov::hint::inference_precision.name()] == ov::element::f16) {
-            inPrc = outPrc = netPrecision = ElementType::f16;
-        } else {
-            inPrc = outPrc = netPrecision;
-        }
     }

     configuration.insert(additionalConfig.begin(), additionalConfig.end());
+    updateSelectedType(getPrimitiveType(), netPrecision == ElementType::boolean ? ElementType::i8 : netPrecision, configuration);

     init_input_shapes(inputShapes);

@@ -120,9 +115,6 @@ void ReduceCPULayerTest::SetUp() {

     const auto reduce = ngraph::builder::makeReduce(paramOuts[0], reductionAxesNode, keepDims, reductionType);

-    selectedType = getPrimitiveType() + "_" +
-                   (inPrc == ElementType::boolean ? "I8" : InferenceEngine::details::convertPrecision(inPrc).name());
-
     // hybrid layouts
     if (inFmts.size() != 0 && outFmts.size() == 0) {
         size_t outShapeSize = inputDynamicShapes[0].size() - axes.size();
@@ -55,7 +55,7 @@ void TransposeLayerCPUTest::SetUp() {

     std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;

-    selectedType = makeSelectedTypeStr("unknown", inType);
+    updateSelectedType("unknown", inType, configuration);

     init_input_shapes({inputShapes});

@@ -4,7 +4,9 @@

 #include "cpu_test_utils.hpp"
+#include "ie_ngraph_utils.hpp"
+#include "openvino/core/type/element_type.hpp"
 #include "utils/rt_info/memory_formats_attribute.hpp"
 #include "utils/general_utils.h"
 #include <cstdint>

 namespace CPUTestUtils {

@@ -271,7 +273,6 @@ std::string CPUTestsBase::getPrimitiveType() const {
     }
     return isaType;
 }
-
 #endif

 std::string CPUTestsBase::getISA(bool skip_amx) const {
@@ -352,6 +353,32 @@ std::string CPUTestsBase::makeSelectedTypeStr(std::string implString, ngraph::el
     return implString;
 }

+void CPUTestsBase::updateSelectedType(const std::string& primitiveType, const ov::element::Type netType, const ov::AnyMap& config) {
+    auto getExecType = [&]() {
+        // inference_precision affects only floating point type networks
+        if (!netType.is_real())
+            return netType;
+
+        const auto it = config.find(ov::hint::inference_precision.name());
+        if (it == config.end())
+            return netType;
+
+        const auto inference_precision_type = it->second.as<ov::element::Type>();
+        // currently plugin only allows to change precision from higher to lower (i.e. f32 -> f16 or f32 -> bf16)
+        if (netType.bitwidth() < inference_precision_type.bitwidth()) {
+            return netType;
+        }
+
+        return inference_precision_type;
+    };
+
+    const auto execType = getExecType();
+
+    selectedType = primitiveType;
+    selectedType.push_back('_');
+    selectedType += InferenceEngine::details::convertPrecision(execType).name();
+}
+
 std::vector<CPUSpecificParams> filterCPUSpecificParams(const std::vector<CPUSpecificParams> &paramsVector) {
     auto adjustBlockedFormatByIsa = [](std::vector<cpu_memory_format_t>& formats) {
         for (auto& format : formats) {
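With updateSelectedType the expected implementation string tracks the effective execution precision rather than the network precision. The rules encoded above, spelled out on a typical call (values illustrative):

    // netType = f32, config = {inference_precision: f16}
    //   -> execType f16, selectedType "<primitiveType>_FP16"
    // netType = i32, same config: the hint is ignored for non-float networks
    //   -> execType i32, selectedType "<primitiveType>_I32"
    // netType = f16, config = {inference_precision: f32}: upcasts are not applied
    //   -> execType f16, selectedType "<primitiveType>_FP16"
    updateSelectedType(getPrimitiveType(), netType, configuration);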
@@ -128,6 +128,7 @@ public:
                                        const std::vector<std::string>& priority);
     //TODO: change to setter method
     static std::string makeSelectedTypeStr(std::string implString, ngraph::element::Type_t elType);
+    void updateSelectedType(const std::string& primitiveType, const ov::element::Type netType, const ov::AnyMap& config);

     CPUInfo getCPUInfo() const;
     std::shared_ptr<ngraph::Function> makeNgraphFunction(const ngraph::element::Type &ngPrc,
@@ -41,7 +41,7 @@ std::string EltwiseLayerTest::getTestCaseName(const testing::TestParamInfo<Eltwi
     results << "OutType=" << outType << "_";
     results << "trgDev=" << targetName;
     for (auto const& configItem : additional_config) {
-        results << "_configItem=" << configItem.first << "_";
+        results << "_configItem=" << configItem.first << "=";
         configItem.second.print(results);
     }
     return results.str();