[CPU] ARM architecture support (#15256)

* [CPU] ARM architecture support

This patch extends the existing CPU plugin with optimized support for ARM CPUs.
Gorokhov Dmitriy 2023-04-12 18:42:05 +04:00 committed by GitHub
parent a368e10fff
commit c283d21215
238 changed files with 5620 additions and 1056 deletions
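
From the application side nothing changes: the device is still exposed as "CPU", and on AArch64 builds the same OpenVINO 2.0 C++ API now dispatches into the ACL/oneDNN-backed implementations added by this patch. A minimal usage sketch (standard OpenVINO API, not part of this diff; the model path is a placeholder):

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Read an IR model; the path is illustrative only.
    std::shared_ptr<ov::Model> model = core.read_model("model.xml");
    // "CPU" resolves to this plugin on x86-64 and, with this patch, on AArch64 as well.
    ov::CompiledModel compiled = core.compile_model(model, "CPU");
    ov::InferRequest request = compiled.create_infer_request();
    request.infer();
    return 0;
}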

3
.gitmodules vendored
View File

@ -69,3 +69,6 @@
[submodule "thirdparty/snappy"]
path = thirdparty/snappy
url = https://github.com/google/snappy.git
[submodule "ARMComputeLibrary"]
path = src/plugins/intel_cpu/thirdparty/ComputeLibrary
url = https://github.com/ARM-software/ComputeLibrary.git

View File

@ -6,7 +6,7 @@
# Common cmake options
#
ie_dependent_option (ENABLE_INTEL_CPU "CPU plugin for OpenVINO Runtime" ON "RISCV64 OR X86 OR X86_64" OFF)
ie_dependent_option (ENABLE_INTEL_CPU "CPU plugin for OpenVINO Runtime" ON "RISCV64 OR X86 OR X86_64 OR AARCH64" OFF)
ie_option (ENABLE_TESTS "unit, behavior and functional tests" OFF)

View File

@ -38,6 +38,43 @@ if(ENABLE_TESTS)
add_subdirectory(tests)
endif()
add_definitions(-DOV_CPU_WITH_DNNL)
set(OV_CPU_WITH_DNNL ON)
if(DNNL_AARCH64_USE_ACL)
add_definitions(-DOV_CPU_WITH_ACL)
set(OV_CPU_WITH_ACL ON)
endif()
if(OV_CPU_WITH_ACL)
set(CMAKE_CXX_STANDARD 14)
endif()
# remove target specific files from compilation
if (NOT OV_CPU_WITH_ACL)
set(EXCLUDE_PATHS ${EXCLUDE_PATHS}
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/acl/*)
endif()
if (NOT X86_64)
set(EXCLUDE_PATHS ${EXCLUDE_PATHS}
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/snippets/x64/*)
endif()
if (NOT AARCH64)
set(EXCLUDE_PATHS ${EXCLUDE_PATHS}
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/arm/*)
endif()
file(GLOB_RECURSE FILES_TO_REMOVE ${EXCLUDE_PATHS})
list(REMOVE_ITEM SOURCES ${FILES_TO_REMOVE})
list(REMOVE_ITEM HEADERS ${FILES_TO_REMOVE})
# create plugin
ie_add_plugin(NAME ${TARGET_NAME}

View File

@ -27,4 +27,4 @@ CPU Plugin contains the following components:
* [OpenVINO™ README](../../../README.md)
* [OpenVINO Core Components](../../README.md)
* [OpenVINO Plugins](../README.md)
* [Developer documentation](../../../docs/dev/index.md)
* [Developer documentation](../../../docs/dev/index.md)

View File

@ -79,6 +79,8 @@ MultiCache::EntryPtr<KeyType, ValueType> MultiCache::getEntry() {
return std::static_pointer_cast<EntryType>(itr->second);
}
using MultiCacheWeakPtr = std::weak_ptr<MultiCache>;
using MultiCacheWeakCPtr = std::weak_ptr<const MultiCache>;
using MultiCachePtr = std::shared_ptr<MultiCache>;
using MultiCacheCPtr = std::shared_ptr<const MultiCache>;

View File

@ -263,6 +263,11 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
if (exclusiveAsyncRequests) // Exclusive request feature disables the streams
streamExecutorConfig._streams = 1;
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
// TODO: multi-stream execution has functional issues on ARM target
streamExecutorConfig._streams = 1;
#endif
CPU_DEBUG_CAP_ENABLE(applyDebugCapsProperties());
updateProperties();
}

View File

@ -48,7 +48,12 @@ struct Config {
std::string device_id = {};
int batchLimit = 0;
float fcSparseWeiDecompressionRate = 1.0f;
#if defined(OPENVINO_ARCH_X86_64)
size_t rtCacheCapacity = 5000ul;
#else
// TODO: Executor cache may lead to incorrect behavior on oneDNN ACL primitives
size_t rtCacheCapacity = 0ul;
#endif
InferenceEngine::IStreamsExecutor::Config streamExecutorConfig;
InferenceEngine::PerfHintsConfig perfHintsConfig;
bool enableCpuPinning = true;

View File

@ -194,7 +194,7 @@ public:
}
enum : Dim {
UNDEFINED_DIM = 0xffffffffffffffff
UNDEFINED_DIM = std::numeric_limits<Dim>::max()
};
private:
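
Note on the UNDEFINED_DIM change above: the hex literal 0xffffffffffffffff only matches the maximum Dim value when Dim (std::size_t) is 64 bits wide, while ARM builds may also be 32-bit; std::numeric_limits<Dim>::max() is correct for either width. A small standalone sketch of the same idea (illustrative, not taken from the diff):

#include <cstddef>
#include <limits>

using Dim = std::size_t;

// Portable "undefined" sentinel: evaluates to the all-ones pattern of Dim
// regardless of whether size_t is 32-bit or 64-bit on the target.
constexpr Dim UNDEFINED_DIM = std::numeric_limits<Dim>::max();

static_assert(UNDEFINED_DIM == static_cast<Dim>(-1),
              "max() of an unsigned type equals the all-ones bit pattern");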

View File

@ -9,9 +9,6 @@
namespace ov {
namespace intel_cpu {
using Dim = std::size_t;
using VectorDims = std::vector<Dim>;
const InferenceEngine::details::caseless_unordered_map<std::string, Type> type_to_name_tbl = {
{ "Constant", Type::Input },
{ "Parameter", Type::Input },
@ -69,6 +66,7 @@ const InferenceEngine::details::caseless_unordered_map<std::string, Type> type_t
{ "SoftPlus", Type::Eltwise },
{ "SoftSign", Type::Eltwise },
{ "Select", Type::Eltwise},
{ "Log", Type::Eltwise },
{ "Reshape", Type::Reshape },
{ "Squeeze", Type::Reshape },
{ "Unsqueeze", Type::Reshape },
@ -163,7 +161,6 @@ const InferenceEngine::details::caseless_unordered_map<std::string, Type> type_t
{ "Floor", Type::Math},
{ "HardSigmoid", Type::Math},
{ "If", Type::If},
{ "Log", Type::Math},
{ "Neg", Type::Math},
{ "Reciprocal", Type::Math},
{ "Selu", Type::Math},
@ -448,7 +445,8 @@ std::string algToString(const Algorithm alg) {
CASE(EltwiseLogicalXor);
CASE(EltwiseLogicalNot);
CASE(EltwiseRelu);
CASE(EltwiseGelu);
CASE(EltwiseGeluErf);
CASE(EltwiseGeluTanh);
CASE(EltwiseElu);
CASE(EltwiseTanh);
CASE(EltwiseSelect);
@ -466,10 +464,10 @@ std::string algToString(const Algorithm alg) {
CASE(EltwiseRoundHalfToEven);
CASE(EltwiseRoundHalfAwayFromZero);
CASE(EltwiseErf);
CASE(EltwiseLog);
CASE(FQCommon);
CASE(FQQuantization);
CASE(FQBinarization);
CASE(FQRequantization);
CASE(ROIPoolingMax);
CASE(ROIPoolingBilinear);
CASE(ROIAlignMax);
@ -502,7 +500,6 @@ std::string algToString(const Algorithm alg) {
CASE(MathErf);
CASE(MathFloor);
CASE(MathHardSigmoid);
CASE(MathLog);
CASE(MathNegative);
CASE(MathReciprocal);
CASE(MathSelu);

View File

@ -160,7 +160,8 @@ enum class Algorithm {
EltwiseLogicalXor,
EltwiseLogicalNot,
EltwiseRelu,
EltwiseGelu,
EltwiseGeluErf,
EltwiseGeluTanh,
EltwiseElu,
EltwiseTanh,
EltwiseSigmoid,
@ -179,12 +180,12 @@ enum class Algorithm {
EltwiseRoundHalfAwayFromZero,
EltwiseErf,
EltwiseSoftSign,
EltwiseLog,
// FakeQuantize algorithms
FQCommon,
FQQuantization,
FQBinarization,
FQRequantization,
// ROIPooling algorithms
ROIPoolingMax,
@ -227,7 +228,6 @@ enum class Algorithm {
MathErf,
MathFloor,
MathHardSigmoid,
MathLog,
MathNegative,
MathReciprocal,
MathSelu,

View File

@ -203,5 +203,80 @@ const char* DnnlExtensionUtils::query_pd_info(const_dnnl_primitive_desc_t pd) {
return pd->info();
}
dnnl::algorithm DnnlExtensionUtils::convertToDnnlAlgorithm(Algorithm alg) {
switch (alg) {
case Algorithm::EltwiseRelu: return dnnl::algorithm::eltwise_relu;
case Algorithm::EltwiseTanh: return dnnl::algorithm::eltwise_tanh;
case Algorithm::EltwiseElu: return dnnl::algorithm::eltwise_elu;
case Algorithm::EltwiseAbs: return dnnl::algorithm::eltwise_abs;
case Algorithm::EltwiseSqrt: return dnnl::algorithm::eltwise_sqrt;
case Algorithm::EltwiseSwish: return dnnl::algorithm::eltwise_swish;
case Algorithm::EltwiseHswish: return dnnl::algorithm::eltwise_hardswish;
case Algorithm::EltwiseSoftRelu: return dnnl::algorithm::eltwise_soft_relu;
case Algorithm::EltwiseMish: return dnnl::algorithm::eltwise_mish;
case Algorithm::EltwiseExp: return dnnl::algorithm::eltwise_exp;
case Algorithm::EltwiseGeluErf: return dnnl::algorithm::eltwise_gelu_erf;
case Algorithm::EltwiseGeluTanh: return dnnl::algorithm::eltwise_gelu_tanh;
case Algorithm::EltwiseSigmoid: return dnnl::algorithm::eltwise_logistic;
case Algorithm::EltwiseClamp: return dnnl::algorithm::eltwise_clip;
case Algorithm::EltwisePowerStatic: return dnnl::algorithm::eltwise_pow;
case Algorithm::EltwiseHsigmoid: return dnnl::algorithm::eltwise_hsigmoid;
case Algorithm::EltwiseRoundHalfToEven: return dnnl::algorithm::eltwise_round_half_to_even;
case Algorithm::EltwiseRoundHalfAwayFromZero: return dnnl::algorithm::eltwise_round_half_away_from_zero;
case Algorithm::EltwiseAdd: return dnnl::algorithm::binary_add;
case Algorithm::EltwiseMultiply: return dnnl::algorithm::binary_mul;
case Algorithm::EltwiseSubtract: return dnnl::algorithm::binary_sub;
case Algorithm::EltwiseDivide: return dnnl::algorithm::binary_div;
case Algorithm::EltwiseMaximum: return dnnl::algorithm::binary_max;
case Algorithm::EltwiseMinimum: return dnnl::algorithm::binary_min;
case Algorithm::EltwiseEqual: return dnnl::algorithm::binary_eq;
case Algorithm::EltwiseNotEqual: return dnnl::algorithm::binary_ne;
case Algorithm::EltwiseGreater: return dnnl::algorithm::binary_gt;
case Algorithm::EltwiseGreaterEqual: return dnnl::algorithm::binary_ge;
case Algorithm::EltwiseLess: return dnnl::algorithm::binary_lt;
case Algorithm::EltwiseLessEqual: return dnnl::algorithm::binary_le;
case Algorithm::EltwisePrelu: return dnnl::algorithm::binary_prelu;
case Algorithm::ReduceMax: return dnnl::algorithm::reduction_max;
case Algorithm::ReduceMin: return dnnl::algorithm::reduction_min;
case Algorithm::ReduceSum: return dnnl::algorithm::reduction_sum;
case Algorithm::ReduceMean: return dnnl::algorithm::reduction_mean;
case Algorithm::FQCommon: return dnnl::algorithm::quantization_quantize_dequantize;
case Algorithm::FQQuantization: return dnnl::algorithm::quantization_quantize;
case Algorithm::FQBinarization: return dnnl::algorithm::binarization_depthwise;
default: return dnnl::algorithm::undef;
}
}
bool DnnlExtensionUtils::isUnarySupportedAsPostOp(Algorithm alg) {
#if defined(OV_CPU_WITH_ACL)
return one_of(alg, Algorithm::EltwiseRelu,
Algorithm::EltwiseTanh,
Algorithm::EltwiseElu,
Algorithm::EltwiseAbs,
Algorithm::EltwiseSqrt,
Algorithm::EltwiseSoftRelu,
Algorithm::EltwiseSigmoid);
#elif defined(OPENVINO_ARCH_X86_64)
return one_of(alg, Algorithm::EltwiseRelu,
Algorithm::EltwiseGeluErf,
Algorithm::EltwiseGeluTanh,
Algorithm::EltwiseElu,
Algorithm::EltwiseSigmoid,
Algorithm::EltwiseClamp,
Algorithm::EltwiseTanh,
Algorithm::EltwiseSwish,
Algorithm::EltwiseHswish,
Algorithm::EltwiseMish,
Algorithm::EltwiseHsigmoid,
Algorithm::EltwiseRoundHalfToEven,
Algorithm::EltwiseRoundHalfAwayFromZero,
Algorithm::EltwiseAbs,
Algorithm::EltwiseSqrt,
Algorithm::EltwiseSoftRelu);
#else
return false;
#endif
}
} // namespace intel_cpu
} // namespace ov

View File

@ -57,6 +57,8 @@ public:
static bool hasProperImplementationType(dnnl::primitive_desc& desc, impl_desc_type implType);
static dnnl_memory_desc_t clone_desc(const_dnnl_memory_desc_t cdesc);
static const char* query_pd_info(const_dnnl_primitive_desc_t pd);
static dnnl::algorithm convertToDnnlAlgorithm(Algorithm alg);
static bool isUnarySupportedAsPostOp(Algorithm alg);
};
} // namespace intel_cpu

View File

@ -15,12 +15,13 @@
#include "jit_dnnl_ext_emitters.hpp"
#include "jit_conversion_emitters.hpp"
#include "snippets_transformations/op/load_convert.hpp"
#include "snippets_transformations/op/store_convert.hpp"
#include "snippets_transformations/op/fused_mul_add.hpp"
#include "snippets_transformations/op/brgemm_copy_b.hpp"
#include "snippets_transformations/op/brgemm_cpu.hpp"
#include "ngraph_transformations/op/swish_cpu.hpp"
#include "transformations/snippets/x64/op/load_convert.hpp"
#include "transformations/snippets/x64/op/store_convert.hpp"
#include "transformations/snippets/x64/op/fused_mul_add.hpp"
#include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
#include "transformations/snippets/x64/op/brgemm_cpu.hpp"
#include "snippets/op/brgemm.hpp"
#include "transformations/cpu_opset/common/op/swish_cpu.hpp"
#include <ngraph/opsets/opset5.hpp>

View File

@ -5,7 +5,7 @@
#pragma once
#include "ngraph/opsets/opset5.hpp"
#include "ngraph_transformations/op/swish_cpu.hpp"
#include "transformations/cpu_opset/common/op/swish_cpu.hpp"
#include "jit_dnnl_emitters.hpp"
namespace ov {

View File

@ -8,8 +8,8 @@
#include "jit_snippets_emitters.hpp"
#include "snippets/op/subgraph.hpp"
#include "snippets/utils.hpp"
#include "snippets_transformations/op/brgemm_copy_b.hpp"
#include "snippets_transformations/op/brgemm_cpu.hpp"
#include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
#include "transformations/snippets/x64/op//brgemm_cpu.hpp"
using namespace InferenceEngine;
using ngraph::snippets::op::Subgraph;

View File

@ -10,7 +10,7 @@
#include "jit_emitter.hpp"
#include "jit_load_store_emitters.hpp"
#include "snippets_transformations/op/store_convert.hpp"
#include "transformations/snippets/x64/op/store_convert.hpp"
// Matmul support:
#include <cpu/x64/brgemm/brgemm.hpp>
#include <cpu/x64/matmul/brgemm_matmul_copy_utils.hpp>

View File

@ -3,17 +3,17 @@
//
#include "extension.h"
#include "ngraph_transformations/op/fully_connected.hpp"
#include "ngraph_transformations/op/interaction.hpp"
#include "ngraph_transformations/op/leaky_relu.hpp"
#include "ngraph_transformations/op/power_static.hpp"
#include "ngraph_transformations/op/swish_cpu.hpp"
#include "ngraph_transformations/op/mha.hpp"
#include "ngraph_transformations/op/ngram.hpp"
#include "snippets_transformations/op/load_convert.hpp"
#include "snippets_transformations/op/store_convert.hpp"
#include "snippets_transformations/op/brgemm_cpu.hpp"
#include "snippets_transformations/op/brgemm_copy_b.hpp"
#include "transformations/cpu_opset/common/op/fully_connected.hpp"
#include "transformations/cpu_opset/common/op/leaky_relu.hpp"
#include "transformations/cpu_opset/common/op/power_static.hpp"
#include "transformations/cpu_opset/common/op/swish_cpu.hpp"
#include "transformations/cpu_opset/common/op/ngram.hpp"
#include "transformations/cpu_opset/x64/op/mha.hpp"
#include "transformations/cpu_opset/x64/op/interaction.hpp"
#include "transformations/snippets/x64/op/load_convert.hpp"
#include "transformations/snippets/x64/op/store_convert.hpp"
#include "transformations/snippets/x64/op/brgemm_cpu.hpp"
#include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
#include <ngraph/ngraph.hpp>
#include <ov_ops/augru_cell.hpp>
@ -46,20 +46,20 @@ std::map<std::string, ngraph::OpSet> Extension::getOpSets() {
auto cpu_plugin_opset = []() {
ngraph::OpSet opset;
#if defined(OPENVINO_ARCH_X86_64)
#define NGRAPH_OP_X64(NAME, NAMESPACE) NGRAPH_OP(NAME, NAMESPACE)
#else
#define NGRAPH_OP_X64(NAME, NAMESPACE)
#endif
#define NGRAPH_OP(NAME, NAMESPACE) opset.insert<NAMESPACE::NAME>();
NGRAPH_OP(InteractionNode, ov::intel_cpu)
NGRAPH_OP(FullyConnectedNode, ov::intel_cpu)
NGRAPH_OP(LeakyReluNode, ov::intel_cpu)
NGRAPH_OP(PowerStaticNode, ov::intel_cpu)
NGRAPH_OP(SwishNode, ov::intel_cpu)
NGRAPH_OP(MHANode, ov::intel_cpu)
NGRAPH_OP(NgramNode, ov::intel_cpu)
NGRAPH_OP(LoadConvertSaturation, ov::intel_cpu)
NGRAPH_OP(LoadConvertTruncation, ov::intel_cpu)
NGRAPH_OP(StoreConvertSaturation, ov::intel_cpu)
NGRAPH_OP(StoreConvertTruncation, ov::intel_cpu)
NGRAPH_OP(BrgemmCPU, ov::intel_cpu)
NGRAPH_OP(BrgemmCopyB, ov::intel_cpu)
NGRAPH_OP_X64(MHANode, ov::intel_cpu)
NGRAPH_OP_X64(InteractionNode, ov::intel_cpu)
#undef NGRAPH_OP
return opset;
@ -157,6 +157,12 @@ std::map<std::string, ngraph::OpSet> Extension::getOpSets() {
NGRAPH_OP(Store, ngraph::snippets::op)
NGRAPH_OP(Subgraph, ngraph::snippets::op)
NGRAPH_OP(VectorBuffer, ngraph::snippets::op)
NGRAPH_OP_X64(LoadConvertSaturation, ov::intel_cpu)
NGRAPH_OP_X64(LoadConvertTruncation, ov::intel_cpu)
NGRAPH_OP_X64(StoreConvertSaturation, ov::intel_cpu)
NGRAPH_OP_X64(StoreConvertTruncation, ov::intel_cpu)
NGRAPH_OP_X64(BrgemmCPU, ov::intel_cpu)
NGRAPH_OP_X64(BrgemmCopyB, ov::intel_cpu)
#undef NGRAPH_OP
return opset;

View File

@ -988,21 +988,7 @@ void GraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(Graph &grap
continue;
}
if (!one_of(fuseCandidate->getAlgorithm(), Algorithm::EltwiseRelu,
Algorithm::EltwiseGelu,
Algorithm::EltwiseElu,
Algorithm::EltwiseSigmoid,
Algorithm::EltwiseClamp,
Algorithm::EltwiseTanh,
Algorithm::EltwiseSwish,
Algorithm::EltwiseHswish,
Algorithm::EltwiseMish,
Algorithm::EltwiseHsigmoid,
Algorithm::EltwiseRoundHalfToEven,
Algorithm::EltwiseRoundHalfAwayFromZero,
Algorithm::EltwiseAbs,
Algorithm::EltwiseSqrt,
Algorithm::EltwiseSoftRelu)) {
if (!DnnlExtensionUtils::isUnarySupportedAsPostOp(fuseCandidate->getAlgorithm())) {
parent++;
continue;
}
@ -1175,17 +1161,7 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph)
auto isFusingSupported = [&](NodePtr conv, NodePtr child) {
return child->getType() == Type::Eltwise &&
one_of(child->getAlgorithm(), Algorithm::EltwiseRelu,
Algorithm::EltwiseElu,
Algorithm::EltwiseSigmoid,
Algorithm::EltwiseClamp,
Algorithm::EltwiseSwish,
Algorithm::EltwiseHswish,
Algorithm::EltwiseMish,
Algorithm::EltwiseHsigmoid,
Algorithm::EltwiseRoundHalfToEven,
Algorithm::EltwiseRoundHalfAwayFromZero,
Algorithm::EltwiseSoftRelu);
DnnlExtensionUtils::isUnarySupportedAsPostOp(child->getAlgorithm());
};
for (auto &graphNode : graphNodes) {

View File

@ -1,56 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <ngraph/pass/constant_folding.hpp>
#include "fc_bias_fusion.hpp"
#include "ngraph/op/fake_quantize.hpp"
#include "ngraph/pass/manager.hpp"
#include "reshape_fc_fusion.hpp"
#include "align_matmul_input_ranks.hpp"
#include "transformations/common_optimizations/reshape_prelu.hpp"
#include "convert_broadcast_to_tiles.hpp"
#include "convert_tile_to_seq_tiles.hpp"
#include "convert_matmul_to_fc.hpp"
#include "convert_to_power_static.hpp"
#include "convert_to_leaky_relu.hpp"
#include "convert_to_swish_cpu.hpp"
#include "transformations/convert_precision.hpp"
#include "transformations/utils/utils.hpp"
#include "rnn_sequences_optimization.hpp"
#include "transformations/common_optimizations/reshape_sequence_fusion.hpp"
#include "ngram_fusion.hpp"
#include "itt.hpp"
namespace ov {
namespace intel_cpu {
inline void ConvertToCPUSpecificOpset(std::shared_ptr<ngraph::Function> &nGraphFunc) {
RUN_ON_FUNCTION_SCOPE(ConvertToCPUSpecificOpset);
ngraph::pass::Manager manager;
manager.set_per_pass_validation(false);
manager.register_pass<ConvertMatMulToFC>();
manager.register_pass<AlignMatMulInputRanks>();
manager.register_pass<ConvertTileToSeqTiles>();
manager.register_pass<FullyConnectedBiasFusion>();
manager.register_pass<ConvertToPowerStatic>();
manager.register_pass<ConvertToLeakyRelu>();
manager.register_pass<ConvertToSwishCPU>();
manager.register_pass<OptimizeSequenceTransposes>();
if (!ov::op::util::has_op_with_type<ngraph::op::FakeQuantize>(nGraphFunc)) {
manager.register_pass<ReshapeFullyConnectedFusion>();
}
// after transformation "MoveEltwiseUpThroughDataMov" there can be Reshape sequences that should be eliminated or fused
manager.register_pass<ov::pass::ReshapeSequenceFusion>();
manager.register_pass<ngraph::pass::ConstantFolding>();
manager.register_pass<ov::pass::ConvertPrecision>(precisions_map {{ ngraph::element::i64, ngraph::element::i32 }});
manager.register_pass<NgramFusion>();
manager.register_pass<ov::pass::Validate>();
manager.run_passes(nGraphFunc);
}
} // namespace intel_cpu
} // namespace ov

View File

@ -455,6 +455,7 @@ std::string Node::getPrimitiveDescriptorType() {
SEARCH_TYPE(winograd);
SEARCH_TYPE(sparse);
SEARCH_TYPE(acl);
SEARCH_TYPE(_dw);
SEARCH_TYPE(_1x1);
@ -959,6 +960,9 @@ void Node::cleanup() {
const std::vector<impl_desc_type>& Node::getPrimitivesPriority() {
std::vector<impl_desc_type> priorities = {
impl_desc_type::unknown,
// Undef impl type is used to express use-cases where the real type is unknown during compilation
// Undef has higher priority than defined types in order to force the primitive selection logic to make a decision based on other properties
impl_desc_type::undef,
impl_desc_type::brgconv_avx512_amx_1x1,
impl_desc_type::brgconv_avx512_amx,
impl_desc_type::jit_avx512_amx_dw,
@ -988,6 +992,7 @@ const std::vector<impl_desc_type>& Node::getPrimitivesPriority() {
impl_desc_type::gemm_avx2,
impl_desc_type::gemm_avx,
impl_desc_type::gemm_sse42,
impl_desc_type::acl,
impl_desc_type::jit_gemm,
impl_desc_type::ref_any,
impl_desc_type::ref,
@ -1340,6 +1345,7 @@ Node* Node::NodesFactory::create(const std::shared_ptr<ngraph::Node>& op, const
}
bool Node::canBePerformedAsScaleShift(const Node *parentNode) const {
#if defined(OPENVINO_ARCH_X86_64)
IE_ASSERT(parentNode);
size_t fusingPort = 0;
@ -1390,6 +1396,10 @@ bool Node::canBePerformedAsScaleShift(const Node *parentNode) const {
Algorithm::EltwisePrelu,
Algorithm::EltwiseMulAdd) && isBroadcastableToDataInput())
|| isConvertablePowerStatic();
#else
// TODO: provide correct list of operations for other backends
return false;
#endif
}
// @todo shifts for Subtract and scales for Divide are replaced with
@ -1606,22 +1616,7 @@ bool Node::canFuseSimpleOperation(const NodePtr& node) const {
}
return ret;
} else if (node->getType() == Type::Eltwise) {
return one_of(node->getAlgorithm(),
Algorithm::EltwiseRelu,
Algorithm::EltwiseGelu,
Algorithm::EltwiseElu,
Algorithm::EltwiseSigmoid,
Algorithm::EltwiseClamp,
Algorithm::EltwiseTanh,
Algorithm::EltwiseSwish,
Algorithm::EltwiseHswish,
Algorithm::EltwiseMish,
Algorithm::EltwiseHsigmoid,
Algorithm::EltwiseRoundHalfToEven,
Algorithm::EltwiseRoundHalfAwayFromZero,
Algorithm::EltwiseAbs,
Algorithm::EltwiseSqrt,
Algorithm::EltwiseSoftRelu) ||
return DnnlExtensionUtils::isUnarySupportedAsPostOp(node->getAlgorithm()) ||
node->canBePerformedAsScaleShift(this);
}
return false;

View File

@ -37,6 +37,8 @@
#include "dnnl_postops_composer.h"
#include "graph_context.h"
#include "nodes/executors/mvn_list.hpp"
#include "nodes/executors/executor.hpp"
namespace ov {
namespace intel_cpu {
@ -75,6 +77,12 @@ class NodeDesc {
public:
NodeDesc(const NodeConfig& conf, impl_desc_type type): config(conf) {
implementationType = type;
executorFactory = nullptr;
}
NodeDesc(const NodeConfig& conf, impl_desc_type type, ExecutorFactoryPtr factory): config(conf) {
implementationType = type;
executorFactory = factory;
}
const NodeConfig& getConfig() const {
@ -93,9 +101,28 @@ public:
implementationType = type;
}
ExecutorFactoryPtr getExecutorFactory() const {
return executorFactory;
}
template <typename T,
typename std::enable_if<!std::is_pointer<T>::value && !std::is_reference<T>::value, int>::type = 0,
typename std::enable_if<std::is_base_of<ExecutorFactory, T>::value, int>::type = 0>
std::shared_ptr<T> getExecutorFactoryAs() {
auto casted = std::dynamic_pointer_cast<T>(executorFactory);
if (!casted)
IE_THROW() << "Cannot dynamically cast ExecutorFactory";
return casted;
}
void setExecutorFactory(ExecutorFactoryPtr factory) {
executorFactory = factory;
}
private:
NodeConfig config;
impl_desc_type implementationType;
ExecutorFactoryPtr executorFactory;
};
class Node {

View File

@ -42,7 +42,7 @@ using namespace Xbyak;
namespace ov {
namespace intel_cpu {
namespace node {
#if defined(OPENVINO_ARCH_X86_64)
#define GET_OFF(field) offsetof(jit_bin_conv_call_args, field)
template <cpu_isa_t isa>
@ -874,7 +874,7 @@ private:
}
}
};
#endif
bool BinaryConvolution::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
if (isDynamicNgraphNode(op)) {
@ -1092,7 +1092,7 @@ void BinaryConvolution::createPrimitive() {
IMPLICATION(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0) || (jcp.stride_w == 1 && jcp.stride_h == 1));
if (!args_ok)
IE_THROW() << "BinaryConvolution with name '" << getName() << "' has unsupported parameters";
#if defined(OPENVINO_ARCH_X86_64)
if (implType == impl_desc_type::jit_avx512) {
bin_conv_kernel.reset(new jit_uni_bin_conv_kernel_f32<x64::avx512_core>(jcp, jcp_dw_conv, *attr.get()));
} else if (implType == impl_desc_type::jit_avx2) {
@ -1102,6 +1102,7 @@ void BinaryConvolution::createPrimitive() {
}
if (bin_conv_kernel)
bin_conv_kernel->create_ker();
#endif
}
bool BinaryConvolution::canFuse(const NodePtr& node) const {

View File

@ -10,7 +10,7 @@
#include <openvino/op/i420_to_bgr.hpp>
#include <openvino/core/type.hpp>
#include <ie/ie_parallel.hpp>
#include <utils/jit_kernel.hpp>
#include "kernels/x64/jit_kernel.hpp"
using namespace InferenceEngine;
using namespace dnnl::impl;
@ -76,6 +76,7 @@ std::tuple<T, T, T> Converter::yuv_to_rgb(float y, float u, float v) {
return std::make_tuple(r, g, b);
}
#if defined(OPENVINO_ARCH_X86_64)
struct jit_uni_converter : public jit_kernel {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_converter)
@ -264,6 +265,7 @@ void jit_uni_converter::store_tail(const variable<T*> & dst,
copy<T>(ptr[dst], s.pointer(), copy_size);
}
#endif
namespace nv12 {
@ -394,6 +396,7 @@ public:
}
};
#if defined(OPENVINO_ARCH_X86_64)
template<typename T>
class JitConverter;
@ -611,7 +614,7 @@ public:
});
}
};
#endif
} // namespace nv12
namespace i420 {
@ -748,6 +751,7 @@ public:
}
};
#if defined(OPENVINO_ARCH_X86_64)
template<typename T>
class JitConverter;
@ -964,13 +968,13 @@ public:
});
}
};
#endif
} // namespace i420
/**
Implements the Color Convert shape inference algorithm. Depending on whether the input has only a single plane, the H dimension is
passed through or recalculated as 2/3 of the initial size.
*
*
*/
class ColorConvertShapeInfer : public ShapeInferEmptyPads {
public:
@ -1098,6 +1102,7 @@ void ColorConvert::initSupportedNV12Impls() {
impls[Precision::FP32][false] = SUPPORTED_IMPL(TwoPlaneConvert, float, ref);
}
#if defined(OPENVINO_ARCH_X86_64)
// jit_uni
{
auto &impls = _supportedImpls[impl_desc_type::jit_uni][algorithm];
@ -1106,7 +1111,7 @@ void ColorConvert::initSupportedNV12Impls() {
impls[Precision::FP32][true] = SUPPORTED_IMPL(SinglePlaneConvert, float, jit_uni);
impls[Precision::FP32][false] = SUPPORTED_IMPL(TwoPlaneConvert, float, jit_uni);
}
#endif
#undef SUPPORTED_IMPL
}
@ -1125,6 +1130,7 @@ void ColorConvert::initSupportedI420Impls() {
impls[Precision::FP32][false] = SUPPORTED_IMPL(ThreePlaneConvert, float, ref);
}
#if defined(OPENVINO_ARCH_X86_64)
// jit_uni
{
auto &impls = _supportedImpls[impl_desc_type::jit_uni][algorithm];
@ -1133,7 +1139,7 @@ void ColorConvert::initSupportedI420Impls() {
impls[Precision::FP32][true] = SUPPORTED_IMPL(SinglePlaneConvert, float, jit_uni);
impls[Precision::FP32][false] = SUPPORTED_IMPL(ThreePlaneConvert, float, jit_uni);
}
#endif
#undef SUPPORTED_IMPL
}

View File

@ -7,25 +7,31 @@
#include <ie_parallel.hpp>
#include <utils/bfloat16.hpp>
#include <utils/general_utils.h>
#include <utils/jit_kernel.hpp>
#include <selective_build.h>
#include <openvino/core/type/float16.hpp>
#include <cpu/x64/jit_generator.hpp>
#include <algorithm>
#include <type_traits>
#include <tuple>
#include <cmath>
#include <onednn/dnnl.h>
#if defined(OPENVINO_ARCH_X86_64)
#include "nodes/kernels/x64/jit_kernel.hpp"
#include <cpu/x64/jit_generator.hpp>
#endif
using namespace InferenceEngine;
using namespace dnnl::impl::utils;
using namespace dnnl::impl::cpu::x64;
using namespace Xbyak;
namespace ov {
namespace intel_cpu {
namespace {
#if defined(OPENVINO_ARCH_X86_64)
using namespace dnnl::impl::utils;
using namespace dnnl::impl::cpu::x64;
using namespace Xbyak;
template <typename src_t, typename dst_t>
void convert_vec(jit_generator & gen,
const RegExp & src,
@ -156,6 +162,8 @@ void jit_convert(const TI* arg, TO* out, size_t count) {
}
}
#endif
template <Precision::ePrecision p>
struct PrecisionInfo {
using value_type = typename PrecisionTrait<p>::value_type;
@ -356,6 +364,7 @@ struct ConvertPrecision<std::tuple<ov::intel_cpu::bfloat16_t, float>> {
}
};
#if defined(OPENVINO_ARCH_X86_64)
template<typename src_t>
struct ConvertPrecision<std::tuple<src_t, ov::float16>> {
void operator()(ConvertContext & ctx) {
@ -462,6 +471,7 @@ struct ConvertPrecision<std::tuple<ov::float16, ov::float16>> {
ctx.converted = true;
}
};
#endif
} // namespace

View File

@ -26,6 +26,8 @@ using namespace Xbyak;
namespace ov {
namespace intel_cpu {
#if defined(OPENVINO_ARCH_X86_64)
template <cpu_isa_t isa>
struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_permute_kernel_f32)
@ -141,6 +143,8 @@ private:
Xbyak::Xmm xmm = Xbyak::Xmm(1);
};
#endif // OPENVINO_ARCH_X86_64
PermuteKernel::PermuteKernel(const PermuteParams& params) : params(params) {
prepareParams();
}
@ -257,6 +261,7 @@ void PermuteKernel::prepareParams() {
jcp.ndims = sorted_order.size();
jcp.data_size = params.data_size;
#if defined(OPENVINO_ARCH_X86_64)
if (mayiuse(cpu::x64::avx512_core)) {
permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::x64::avx512_core>(jcp));
} else if (mayiuse(cpu::x64::avx2)) {
@ -264,6 +269,7 @@ void PermuteKernel::prepareParams() {
} else if (mayiuse(cpu::x64::sse41)) {
permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::x64::sse41>(jcp));
}
#endif // OPENVINO_ARCH_X86_64
if (permute_kernel)
permute_kernel->create_ker();

View File

@ -9,7 +9,7 @@
#include <cpu/x64/injectors/jit_uni_eltwise_injector.hpp>
#include <onednn/dnnl.h>
#include "utils/bfloat16.hpp"
#include "emitters/jit_bf16_emitters.hpp"
#include "emitters/x64/jit_bf16_emitters.hpp"
#include <algorithm>
#include <cassert>
@ -50,7 +50,7 @@ struct jit_uni_softmax_kernel {
virtual void create_ker() = 0;
};
#if defined(OPENVINO_ARCH_X86_64)
template <cpu_isa_t isa>
struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_softmax_kernel_f32)
@ -226,7 +226,7 @@ private:
}
}
};
#endif
SoftmaxGeneric::SoftmaxGeneric(Precision inpPrc, Precision outPrc)
: input_prec(inpPrc), output_prec(outPrc) {
if (Precision::BF16 == output_prec) {
@ -236,6 +236,7 @@ SoftmaxGeneric::SoftmaxGeneric(Precision inpPrc, Precision outPrc)
}
block_size = 1;
#if defined(OPENVINO_ARCH_X86_64)
auto jcp = jit_softmax_config_params();
jcp.src_dt = inpPrc;
jcp.dst_dt = outPrc;
@ -252,12 +253,14 @@ SoftmaxGeneric::SoftmaxGeneric(Precision inpPrc, Precision outPrc)
}
if (softmax_kernel)
softmax_kernel->create_ker();
#endif
}
template<typename in_data_t, typename out_data_t>
void SoftmaxGeneric::calculate(const in_data_t *src_data, out_data_t *dst_data, int B, int C, int H, int W) {
for (int b = 0; b < B; b++) {
int tail_start = 0;
if (softmax_kernel) {
int blocks_num = H*W / block_size;

View File

@ -327,6 +327,9 @@ InferenceEngine::Precision Convolution::fusedEltwisePrecision(const NodePtr& fus
const std::vector<impl_desc_type>& Convolution::getPrimitivesPriority() {
std::vector<impl_desc_type> priorities = {
impl_desc_type::unknown,
impl_desc_type::dw_acl,
impl_desc_type::winograd_acl,
impl_desc_type::gemm_acl,
impl_desc_type::brgconv_avx512_amx_1x1,
impl_desc_type::brgconv_avx512_amx,
impl_desc_type::jit_avx512_amx_dw,
@ -556,6 +559,7 @@ void Convolution::getSupportedDescriptors() {
auto inputShape = getInputShapeAtPort(0);
auto outputShape = getOutputShapeAtPort(0);
#if defined(OPENVINO_ARCH_X86_64)
bool acceptedFormat = inputDataType == memory::data_type::bf16;
bool nspcAdded = false;
acceptedFormat |= (shouldTryBrgconv && inputDataType == memory::data_type::f32);
@ -594,6 +598,15 @@ void Convolution::getSupportedDescriptors() {
out_candidate = std::make_shared<DnnlBlockedMemoryDesc>(outputShape, outputDataType, nspc);
createDescriptor({ in_candidate }, { out_candidate });
}
#else
(void)ncsp;
(void)nCsp8c;
(void)nCsp16c;
in_candidate = std::make_shared<DnnlBlockedMemoryDesc>(inputShape, inputDataType, nspc);
out_candidate = std::make_shared<DnnlBlockedMemoryDesc>(outputShape, outputDataType, nspc);
createDescriptor({ in_candidate }, { out_candidate });
#endif
}
void Convolution::setPostOps(dnnl::primitive_attr& attr,
@ -899,7 +912,7 @@ void Convolution::createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
if (isWinograd())
algorithms.push_back(dnnl::algorithm::convolution_winograd);
algorithms.push_back(dnnl::algorithm::convolution_direct);
algorithms.push_back(baseConvAlgorithm);
updatePadding();
@ -1367,7 +1380,8 @@ void Convolution::prepareParams() {
getParentEdgeAt(1)->getParent()->isConstant()};
auto engine = getEngine();
auto builder = [&engine](const ConvKey& key) -> executorPtr {
auto convAlg = baseConvAlgorithm;
auto builder = [&engine, convAlg](const ConvKey& key) -> executorPtr {
// remove the requirement on weight memory layout to let primitive
// report the best layout for weight to be reordered dynamically at runtime
auto wghDescAny =
@ -1405,7 +1419,7 @@ void Convolution::prepareParams() {
attr);
};
const auto alg = (key.implType & impl_desc_type::winograd) ? dnnl::algorithm::convolution_winograd : dnnl::algorithm::convolution_direct;
const auto alg = (key.implType & impl_desc_type::winograd) ? dnnl::algorithm::convolution_winograd : convAlg;
dnnl::primitive_desc desc = createDnnlConvDesc(engine,
key.inp0->getDnnlDesc(),
wghDescAny,
@ -1419,6 +1433,7 @@ void Convolution::prepareParams() {
key.attr);
auto itpd = desc;
executorPtr execPtr = nullptr;
while (static_cast<bool>(itpd)) {
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
@ -1456,7 +1471,7 @@ void Convolution::prepareParams() {
key.dilation,
key.paddingL,
key.paddingR,
dnnl::algorithm::convolution_direct,
convAlg,
key.attr);
if (reorderConvDesc) {

View File

@ -171,6 +171,13 @@ private:
MemoryPtr stockInputZeroPointsMemPtr;
dnnl::memory::data_type outputDataType;
InferenceEngine::Precision sumPrc = InferenceEngine::Precision::UNSPECIFIED;
// TODO: migrate to the convolution_auto algorithm for x64
#if defined(OPENVINO_ARCH_X86_64)
const dnnl::algorithm baseConvAlgorithm = dnnl::algorithm::convolution_direct;
#else
const dnnl::algorithm baseConvAlgorithm = dnnl::algorithm::convolution_auto;
#endif
};
} // namespace node

View File

@ -27,7 +27,7 @@ using namespace Xbyak;
namespace ov {
namespace intel_cpu {
namespace node {
#if defined(OPENVINO_ARCH_X86_64)
#define GET_OFF(field) offsetof(jit_def_conv_call_args, field)
template <cpu_isa_t isa>
@ -671,7 +671,7 @@ private:
pop(reg_sampled_offs);
}
};
#endif
bool DeformableConvolution::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
if (!one_of(op->get_type_info(),
@ -1033,7 +1033,7 @@ DeformableConvolution::DefConvExecutor::DefConvExecutor(const DefConvAttr &defCo
if (withModulation) {
modStrides = descVector[MOD_ID]->getStrides();
}
#if defined(OPENVINO_ARCH_X86_64)
const VectorDims srcDims = descVector[DATA_ID]->getShape().getStaticDims();
const VectorDims weiDims = descVector[WEI_ID]->getShape().getStaticDims();
const VectorDims dstDims = descVector[descVector.size() - 1]->getShape().getStaticDims();
@ -1084,11 +1084,13 @@ DeformableConvolution::DefConvExecutor::DefConvExecutor(const DefConvAttr &defCo
jcp.nb_oc_blocking = !mayiuse(cpu::x64::avx2) ? 2 : 4;
jcp.nthr = dnnl_get_max_threads();
#endif
}
DeformableConvolution::DefConvJitExecutor::DefConvJitExecutor(const DefConvAttr &defConvAttr,
const std::vector<std::shared_ptr<BlockedMemoryDesc>> &descVector) :
DefConvExecutor(defConvAttr, descVector) {
#if defined(OPENVINO_ARCH_X86_64)
if (mayiuse(cpu::x64::avx512_core)) {
def_conv_kernel.reset(new jit_uni_def_conv_kernel_f32<cpu::x64::avx512_core>(jcp));
} else if (mayiuse(cpu::x64::avx2)) {
@ -1103,6 +1105,7 @@ DeformableConvolution::DefConvJitExecutor::DefConvJitExecutor(const DefConvAttr
} else {
IE_THROW() << "Can't compile DefConvJitExecutor";
}
#endif
}
void DeformableConvolution::DefConvRefExecutor::exec(const float* src, const float* offsets,

View File

@ -123,7 +123,7 @@ inline float getImaginaryFromComplexProd(float lhsReal, float lhsImag, float rhs
/*
Returns true while we can iterate
Specified axis is skipped in counters
Specified axis is skipped in counters
*/
inline bool nextIterationStep(std::vector<size_t>& counters, const std::vector<size_t>& iterationRange, size_t axis) {
auto itCounter = counters.rbegin();
@ -535,7 +535,6 @@ void DFT::prepareParams() {
hasFFT = true;
}
}
if (mayiuse(cpu::x64::sse41)) {
createJITKernels(hasDFT, hasFFT);
}
@ -553,8 +552,8 @@ std::vector<int32_t> DFT::getAxes() const {
std::sort(axes.begin(), axes.end());
return axes;
}
void DFT::createJITKernels(bool hasDFT, bool hasFFT) {
#if defined(OPENVINO_ARCH_X86_64)
if (hasDFT && dftKernel == nullptr) {
if (mayiuse(cpu::x64::avx512_core)) {
dftKernel.reset(new jit_uni_dft_kernel_f32<cpu::x64::avx512_core>());
@ -584,8 +583,8 @@ void DFT::createJITKernels(bool hasDFT, bool hasFFT) {
if (fftKernel)
fftKernel->create_ker();
}
#endif
}
} // namespace node
} // namespace intel_cpu
} // namespace ov

View File

@ -8,7 +8,7 @@
#include <node.h>
#include <string>
#include "kernels/dft_uni_kernel.hpp"
#include "kernels/x64/dft_uni_kernel.hpp"
namespace ov {
namespace intel_cpu {
@ -31,7 +31,6 @@ public:
private:
std::vector<int32_t> getAxes() const;
void createJITKernels(bool hasDFT, bool hasFFT);
void dftNd(float* output,
const VectorDims& outputShape,
const VectorDims& outputStrides,

View File

@ -23,10 +23,10 @@
#include "input.h"
#include "common/cpu_convert.h"
#include "emitters/jit_emitter.hpp"
#include "emitters/jit_eltwise_emitters.hpp"
#include "emitters/jit_dnnl_emitters.hpp"
#include "emitters/jit_bf16_emitters.hpp"
#include "emitters/x64/jit_emitter.hpp"
#include "emitters/x64/jit_eltwise_emitters.hpp"
#include "emitters/x64/jit_dnnl_emitters.hpp"
#include "emitters/x64/jit_bf16_emitters.hpp"
#include <selective_build.h>
#include "utils/general_utils.h"
#include "utils/cpu_utils.hpp"
@ -34,9 +34,9 @@
#include "ngraph/ngraph.hpp"
#include <ngraph/opsets/opset1.hpp>
#include "ngraph_transformations/op/power_static.hpp"
#include "ngraph_transformations/op/leaky_relu.hpp"
#include "ngraph_transformations/op/swish_cpu.hpp"
#include "transformations/cpu_opset/common/op/power_static.hpp"
#include "transformations/cpu_opset/common/op/leaky_relu.hpp"
#include "transformations/cpu_opset/common/op/swish_cpu.hpp"
#include <string>
#include <vector>
@ -58,7 +58,8 @@ using namespace Xbyak;
namespace ov {
namespace intel_cpu {
namespace node {
namespace {
#if defined(OPENVINO_ARCH_X86_64)
template<typename T>
struct SupportedPrecisions {
@ -106,61 +107,7 @@ struct EltwiseEmitter<jit_is_inf_emitter> {
}
};
/**
* Implements Eltwise shape inference algorithm. The algorithm is based on broadcasting all the input shapes
* according to the NUMPY broadcast rule. This implementation is more lightweight than the ngraph one.
*
*/
class EltwiseShapeInfer : public ShapeInferEmptyPads {
public:
Result infer(
const std::vector<std::reference_wrapper<const VectorDims>>& input_shapes,
const std::unordered_map<size_t, MemoryPtr>& data_dependency) override {
size_t max_rank = 0;
size_t max_rank_idx = 0;
for (size_t i = 0; i < input_shapes.size(); ++i) {
auto item_rank = input_shapes[i].get().size();
if (item_rank > max_rank) {
max_rank = item_rank;
max_rank_idx = i;
}
}
auto output_shape = input_shapes[max_rank_idx].get();
// use NUMPY broadcast rule
for (size_t i = 0; i < input_shapes.size(); i++) {
if (i == max_rank_idx)
continue;
auto& input_shape = input_shapes[i].get();
if (input_shape.size() > output_shape.size()) {
IE_THROW() << "Eltwise shape infer input and output shapes rank mismatch";
}
size_t offset = output_shape.size() - input_shape.size();
for (size_t j = 0; j < input_shape.size(); ++j) {
if (input_shape[j] != output_shape[offset + j]) {
if (output_shape[offset + j] == 1) {
output_shape[offset + j] = input_shape[j];
} else {
if (input_shape[j] != 1) IE_THROW() << "Eltwise shape infer input shapes dim index: " << j << " mismatch";
}
}
}
}
return { { std::move(output_shape) }, ShapeInferStatus::success };
}
port_mask_t get_port_mask() const override {
return EMPTY_PORT_MASK;
}
};
class EltwiseShapeInferFactory : public ShapeInferFactory {
public:
ShapeInferPtr makeShapeInfer() const override {
return std::make_shared<EltwiseShapeInfer>();
}
};
void set_intersection(const std::set<std::vector<element::Type>>& precisions1,
static void set_intersection(const std::set<std::vector<element::Type>>& precisions1,
const std::set<std::vector<element::Type>>& precisions2,
std::set<std::vector<element::Type>>& intersection) {
std::map<element::Type, size_t> intersection_types;
@ -181,9 +128,6 @@ void set_intersection(const std::set<std::vector<element::Type>>& precisions1,
}
}
} // namespace
InferenceEngine::Precision eltwise_precision_helper::get_precision(const size_t inputs_number,
const InferenceEngine::Precision(&src_prc)[MAX_ELTWISE_INPUTS],
const std::vector<Eltwise::EltwiseData>& eltwise_data) {
@ -261,7 +205,8 @@ std::set<std::vector<element::Type>> eltwise_precision_helper::get_supported_pre
OV_SWITCH(intel_cpu, SupportedPrecisions, precisions, algo,
OV_CASE(Algorithm::EltwiseRelu, jit_dnnl_aux_emitter),
OV_CASE(Algorithm::EltwiseGelu, jit_dnnl_aux_emitter),
OV_CASE(Algorithm::EltwiseGeluErf, jit_dnnl_aux_emitter),
OV_CASE(Algorithm::EltwiseGeluTanh, jit_dnnl_aux_emitter),
OV_CASE(Algorithm::EltwiseElu, jit_dnnl_aux_emitter),
OV_CASE(Algorithm::EltwiseTanh, jit_dnnl_aux_emitter),
OV_CASE(Algorithm::EltwiseSigmoid, jit_dnnl_aux_emitter),
@ -633,7 +578,8 @@ private:
OV_SWITCH(intel_cpu, EltwiseEmitter, ctx, data.algo,
OV_CASE(Algorithm::EltwiseRelu, jit_dnnl_aux_emitter),
OV_CASE(Algorithm::EltwiseGelu, jit_dnnl_aux_emitter),
OV_CASE(Algorithm::EltwiseGeluErf, jit_dnnl_aux_emitter),
OV_CASE(Algorithm::EltwiseGeluTanh, jit_dnnl_aux_emitter),
OV_CASE(Algorithm::EltwiseElu, jit_dnnl_aux_emitter),
OV_CASE(Algorithm::EltwiseTanh, jit_dnnl_aux_emitter),
OV_CASE(Algorithm::EltwiseSigmoid, jit_dnnl_aux_emitter),
@ -972,6 +918,66 @@ private:
}
};
#endif // OPENVINO_ARCH_X86_64
namespace {
/**
* Implements Eltwise shape inference algorithm. The algorithm is based on broadcasting all the input shapes
* according to the NUMPY broadcast rule. This implementation is more lightweight than the ngraph one.
*
*/
class EltwiseShapeInfer : public ShapeInferEmptyPads {
public:
Result infer(
const std::vector<std::reference_wrapper<const VectorDims>>& input_shapes,
const std::unordered_map<size_t, MemoryPtr>& data_dependency) override {
size_t max_rank = 0;
size_t max_rank_idx = 0;
for (size_t i = 0; i < input_shapes.size(); ++i) {
auto item_rank = input_shapes[i].get().size();
if (item_rank > max_rank) {
max_rank = item_rank;
max_rank_idx = i;
}
}
auto output_shape = input_shapes[max_rank_idx].get();
// use NUMPY broadcast rule
for (size_t i = 0; i < input_shapes.size(); i++) {
if (i == max_rank_idx)
continue;
auto& input_shape = input_shapes[i].get();
if (input_shape.size() > output_shape.size()) {
IE_THROW() << "Eltwise shape infer input and output shapes rank mismatch";
}
size_t offset = output_shape.size() - input_shape.size();
for (size_t j = 0; j < input_shape.size(); ++j) {
if (input_shape[j] != output_shape[offset + j]) {
if (output_shape[offset + j] == 1) {
output_shape[offset + j] = input_shape[j];
} else {
if (input_shape[j] != 1) IE_THROW() << "Eltwise shape infer input shapes dim index: " << j << " mismatch";
}
}
}
}
return { { std::move(output_shape) }, ShapeInferStatus::success };
}
port_mask_t get_port_mask() const override {
return EMPTY_PORT_MASK;
}
};
class EltwiseShapeInferFactory : public ShapeInferFactory {
public:
ShapeInferPtr makeShapeInfer() const override {
return std::make_shared<EltwiseShapeInfer>();
}
};
} // namespace
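
For reference, the NUMPY broadcast rule that EltwiseShapeInfer applies can be sketched standalone as follows (simplified illustration, not part of the plugin sources):

#include <algorithm>
#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

// Align ranks from the right; a dimension of 1 stretches to match,
// otherwise the dimensions must be equal.
std::vector<std::size_t> broadcast_shapes(const std::vector<std::vector<std::size_t>>& shapes) {
    std::size_t max_rank = 0;
    for (const auto& s : shapes)
        max_rank = std::max(max_rank, s.size());
    std::vector<std::size_t> out(max_rank, 1);
    for (const auto& s : shapes) {
        const std::size_t offset = max_rank - s.size();
        for (std::size_t j = 0; j < s.size(); ++j) {
            if (out[offset + j] == 1) {
                out[offset + j] = s[j];
            } else if (s[j] != 1 && s[j] != out[offset + j]) {
                throw std::runtime_error("broadcast mismatch at dim " + std::to_string(j));
            }
        }
    }
    return out;
}

// Example: broadcast_shapes({{2, 3, 1, 5}, {3, 4, 1}}) yields {2, 3, 4, 5}.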
Eltwise::BroadcastingPolicy Eltwise::determineBroadcastingPolicy(const std::shared_ptr<ngraph::Node>& op) {
const auto const1 = ov::as_type_ptr<ngraph::opset1::Constant>(op->get_input_node_shared_ptr(0));
const auto const2 = ov::as_type_ptr<ngraph::opset1::Constant>(op->get_input_node_shared_ptr(1));
@ -1088,23 +1094,24 @@ const std::map<const ngraph::DiscreteTypeInfo, Eltwise::Initializer> Eltwise::in
node.beta = 0.0f;
}},
{ngraph::op::v0::Gelu::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, Eltwise& node) {
node.algorithm = Algorithm::EltwiseGelu;
node.algorithm = Algorithm::EltwiseGeluErf;
node.onednnAlgorithm = dnnl::algorithm::eltwise_gelu_erf;
}},
{ngraph::op::v7::Gelu::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, Eltwise& node) {
auto gelu = getNgraphOpAs<ngraph::op::v7::Gelu>(op);
node.algorithm = Algorithm::EltwiseGelu;
ngraph::op::GeluApproximationMode approximationMode = gelu->get_approximation_mode();
if (approximationMode == ngraph::op::GeluApproximationMode::ERF)
if (approximationMode == ngraph::op::GeluApproximationMode::ERF) {
node.algorithm = Algorithm::EltwiseGeluErf;
node.onednnAlgorithm = dnnl::algorithm::eltwise_gelu_erf;
else if (approximationMode == ngraph::op::GeluApproximationMode::TANH)
} else if (approximationMode == ngraph::op::GeluApproximationMode::TANH) {
node.algorithm = Algorithm::EltwiseGeluTanh;
node.onednnAlgorithm = dnnl::algorithm::eltwise_gelu_tanh;
else
} else {
IE_THROW(NotImplemented) << "CPU Eltwise node doesn't support ngraph operation Gelu with approximation mode: " << approximationMode;
}
}},
{ngraph::op::v0::Elu::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, Eltwise& node) {
auto eluOp = getNgraphOpAs<ngraph::op::v0::Elu>(op);
node.alpha = static_cast<float>(eluOp->get_alpha());
node.algorithm = Algorithm::EltwiseElu;
node.onednnAlgorithm = dnnl::algorithm::eltwise_elu;
@ -1197,6 +1204,9 @@ const std::map<const ngraph::DiscreteTypeInfo, Eltwise::Initializer> Eltwise::in
{ngraph::op::v1::Select::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, Eltwise& node) {
node.algorithm = Algorithm::EltwiseSelect;
}},
{ngraph::op::v0::Log::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, Eltwise& node) {
node.algorithm = Algorithm::EltwiseLog;
}},
};
@ -1503,6 +1513,7 @@ public:
std::transform(jep.oc_offsets.begin(), jep.oc_offsets.end(), jep.oc_offsets.begin(),
[](size_t& offset) { return offset * sizeof(float);});
#if defined(OPENVINO_ARCH_X86_64)
if (mayiuse(x64::avx512_core)) {
_pKernel.reset(new jit_uni_eltwise_generic<x64::avx512_core>(jep, eltwise_data, ops_list, post_ops));
} else if (mayiuse(x64::avx2)) {
@ -1512,7 +1523,7 @@ public:
} else {
IE_THROW() << "Can't create jit eltwise kernel";
}
#endif // OPENVINO_ARCH_X86_64
if (_pKernel)
_pKernel->create_ker();
}
@ -1629,6 +1640,15 @@ public:
}
void exec(const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) override {
if (_opData.algo == Algorithm::EltwiseLog) {
const float* src_ptr_f = reinterpret_cast<const float*>(args_ptrs.src_ptr[0]);
float* dst_ptr_f = reinterpret_cast<float*>(args_ptrs.dst_ptr);
parallel_for(_fullWorkAmount, [&](size_t i) {
dst_ptr_f[i] = logf(src_ptr_f[i]);
});
return;
}
std::shared_ptr<ref_eltwise_scalar_fwd_t> ref_eltwise_injector = nullptr;
if (_opData.onednnAlgorithm != dnnl::algorithm::undef) {
ref_eltwise_injector = std::make_shared<ref_eltwise_scalar_fwd_t>(
@ -1671,7 +1691,8 @@ public:
switch (_opData.algo) {
case Algorithm::EltwiseRelu:
case Algorithm::EltwiseGelu:
case Algorithm::EltwiseGeluErf:
case Algorithm::EltwiseGeluTanh:
case Algorithm::EltwiseElu:
case Algorithm::EltwiseTanh:
case Algorithm::EltwiseSigmoid:
@ -1816,7 +1837,8 @@ size_t Eltwise::getOpInputsNum() const {
case Algorithm::EltwiseIsInf:
case Algorithm::EltwiseIsNaN:
case Algorithm::EltwiseRelu:
case Algorithm::EltwiseGelu:
case Algorithm::EltwiseGeluErf:
case Algorithm::EltwiseGeluTanh:
case Algorithm::EltwiseElu:
case Algorithm::EltwiseTanh:
case Algorithm::EltwiseSigmoid:
@ -1835,6 +1857,7 @@ size_t Eltwise::getOpInputsNum() const {
case Algorithm::EltwiseRoundHalfToEven:
case Algorithm::EltwiseRoundHalfAwayFromZero:
case Algorithm::EltwiseSoftSign:
case Algorithm::EltwiseLog:
return 1;
case Algorithm::EltwiseAdd:
case Algorithm::EltwiseSubtract:
@ -1899,6 +1922,8 @@ void Eltwise::initSupportedPrimitiveDescriptors() {
// if dim rank is greater than the maximum possible, we should use the reference execution
bool canUseOptimizedImpl = mayiuse(x64::sse41) && getInputShapeAtPort(0).getRank() <= MAX_ELTWISE_DIM_RANK;
// TODO: Add EltwiseLog algorithm support for JIT implementation
canUseOptimizedImpl &= !one_of(getAlgorithm(), Algorithm::EltwiseLog);
bool canUseOptimizedShapeAgnosticImpl = isDynamicNode() && canUseOptimizedImpl;
if (!canUseOptimizedImpl && !fusedWith.empty()) {
@ -1992,7 +2017,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() {
Blocked
};
auto initDesc = [&] (LayoutType lt) -> NodeDesc {
auto initDesc = [&] (LayoutType lt, bool useAclExecutor = false) -> NodeDesc {
auto createMemoryDesc = [lt](const Shape &shape, Precision prc, size_t offset) -> std::shared_ptr<CpuBlockedMemoryDesc> {
const auto &dims = shape.getDims();
if (lt == ChannelsFirst && shape.getRank() != 1) {
@ -2072,18 +2097,36 @@ void Eltwise::initSupportedPrimitiveDescriptors() {
config.outConfs.push_back(portConfig);
impl_desc_type impl_type;
if (mayiuse(x64::avx512_core)) {
impl_type = impl_desc_type::jit_avx512;
} else if (mayiuse(x64::avx2)) {
impl_type = impl_desc_type::jit_avx2;
} else if (mayiuse(x64::sse41)) {
impl_type = impl_desc_type::jit_sse42;
} else {
impl_type = impl_desc_type::ref;
}
if (useAclExecutor) {
impl_desc_type impl_type = impl_desc_type::undef;
return {config, impl_type};
std::vector<MemoryDescPtr> srcMemoryDescs;
for (int i = 0; i < config.inConfs.size(); i++) {
srcMemoryDescs.push_back(config.inConfs[i].getMemDesc());
}
std::vector<MemoryDescPtr> dstMemoryDescs;
for (int i = 0; i < config.outConfs.size(); i++) {
dstMemoryDescs.push_back(config.outConfs[i].getMemDesc());
}
auto factory = std::make_shared<EltwiseExecutorFactory>(eltwiseAttrs, srcMemoryDescs, dstMemoryDescs,
std::make_shared<ExecutorContext>(context, getPrimitivesPriority()));
return {config, impl_type, !factory->isEmpty() ? factory : nullptr};
} else {
impl_desc_type impl_type = impl_desc_type::ref;
if (canUseOptimizedImpl) {
if (mayiuse(x64::avx512_core)) {
impl_type = impl_desc_type::jit_avx512;
} else if (mayiuse(x64::avx2)) {
impl_type = impl_desc_type::jit_avx2;
} else if (mayiuse(x64::sse41)) {
impl_type = impl_desc_type::jit_sse42;
}
}
return {config, impl_type};
}
};
bool isChannelsFirstApplicable = one_of(getOutputShapeAtPort(0).getRank(), 1u, 2u, 3u, 4u, 5u);
@ -2105,14 +2148,31 @@ void Eltwise::initSupportedPrimitiveDescriptors() {
isBlockedApplicable = isBlockedApplicable && inShape.getMinDims()[1] != Shape::UNDEFINED_DIM && inShape.getMinDims()[1] > 1;
}
inputNum = getParentEdges().size();
currentInBlkDims.resize(inputNum);
#if defined (OV_CPU_WITH_ACL)
eltwiseAttrs = {algorithm, alpha, beta, gamma};
if (isChannelsFirstApplicable) {
auto channelFirstDesc = initDesc(ChannelsFirst, true);
if (channelFirstDesc.getExecutorFactory())
supportedPrimitiveDescriptors.emplace_back(channelFirstDesc);
}
auto planarDesc = initDesc(Planar, true);
if (planarDesc.getExecutorFactory())
supportedPrimitiveDescriptors.emplace_back(planarDesc);
canUseAclExecutor = !supportedPrimitiveDescriptors.empty();
if (canUseAclExecutor)
return;
#endif
if (isChannelsFirstApplicable)
supportedPrimitiveDescriptors.emplace_back(initDesc(ChannelsFirst));
if (isBlockedApplicable)
supportedPrimitiveDescriptors.emplace_back(initDesc(Blocked));
supportedPrimitiveDescriptors.emplace_back(initDesc(Planar));
inputNum = getParentEdges().size();
currentInBlkDims.resize(inputNum);
}
void Eltwise::createPrimitive() {
@ -2141,6 +2201,21 @@ void Eltwise::createPrimitive() {
}
void Eltwise::prepareParams() {
if (canUseAclExecutor) {
std::vector<MemoryDescPtr> srcMemoryDescs;
for (int i = 0; i < getParentEdges().size(); i++) {
srcMemoryDescs.push_back(getParentEdgeAt(i)->getMemoryPtr()->getDescPtr());
}
std::vector<MemoryDescPtr> dstMemoryDescs;
dstMemoryDescs.push_back(getChildEdgeAt(0)->getMemoryPtr()->getDescPtr());
auto selectedPD = getSelectedPrimitiveDescriptor();
aclExecPtr = selectedPD->getExecutorFactoryAs<EltwiseExecutorFactory>()->makeExecutor(eltwiseAttrs, srcMemoryDescs, dstMemoryDescs, {});
selectedPD->setImplementationType(aclExecPtr->getImplType());
return;
}
auto outBlockingDesc = getChildEdgeAt(0)->getMemory().GetDescWithType<BlockedMemoryDesc>();
const auto &outOrder = outBlockingDesc->getOrder();
const auto &currentOutBlkDims = outBlockingDesc->getBlockDims();
@ -2309,6 +2384,15 @@ void Eltwise::execute(dnnl::stream strm) {
}
execPtr->exec(args_ptrs, dims_out);
} else if (aclExecPtr) {
std::vector<MemoryCPtr> srcMemory;
for (int i = 0; i < getParentEdges().size(); i++) {
srcMemory.push_back(getParentEdgeAt(i)->getMemoryPtr());
}
std::vector<MemoryPtr> dstMemory;
dstMemory.push_back(getChildEdgeAt(0)->getMemoryPtr());
aclExecPtr->exec(srcMemory, dstMemory, fqDataPtrs.data());
} else {
IE_THROW() << "Can't execute eltwise node with name: " << getName() << ". Primitive isn't created";
}
@ -2594,6 +2678,9 @@ bool Eltwise::canFuse(const NodePtr& node) const {
if (!mayiuse(x64::sse41) || getInputShapeAtPort(0).getRank() > MAX_ELTWISE_DIM_RANK)
return false;
// TODO: EltwiseLog is supported only via reference executor
if (getAlgorithm() == Algorithm::EltwiseLog || node->getAlgorithm() == Algorithm::EltwiseLog)
return false;
bool isIntegerNode = isIntegerComputeSupported(this);
if (isIntegerNode && node->getType() != Type::Eltwise)
@ -2669,4 +2756,4 @@ InferenceEngine::Precision Eltwise::getRuntimePrecision() const {
} // namespace node
} // namespace intel_cpu
} // namespace ov
} // namespace ov

View File

@ -10,6 +10,7 @@
#include <vector>
#include <memory>
#include <caseless.hpp>
#include "executors/eltwise_list.hpp"
namespace ov {
namespace intel_cpu {
@ -199,6 +200,10 @@ private:
void appendMemory(const std::vector<float> &data, MemoryPtr &memPtr, std::vector<MemoryPtr>& postOpsMem);
void appendMemory(const std::vector<float> &data, MemoryPtr &memPtr, std::vector<const void*>& postOpsMem);
bool canUseAclExecutor = false;
EltwiseAttrs eltwiseAttrs;
std::shared_ptr<EltwiseExecutor> aclExecPtr = nullptr;
};
class eltwise_precision_helper {
@ -213,4 +218,4 @@ private:
} // namespace node
} // namespace intel_cpu
} // namespace ov
} // namespace ov

View File

@ -0,0 +1,390 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "acl_eltwise.hpp"
#include "acl_utils.hpp"
namespace ov {
namespace intel_cpu {
using namespace arm_compute;
inline VectorDims reshape_sizes(VectorDims dims) {
const size_t MAX_NUM_SHAPE = arm_compute::MAX_DIMS;
VectorDims result_dims(MAX_NUM_SHAPE - 1);
if (dims.size() >= MAX_NUM_SHAPE) {
for (int i = 0; i < MAX_NUM_SHAPE - 1; i++) {
result_dims[i] = dims[i];
}
for (int i = MAX_NUM_SHAPE - 1; i < dims.size(); i++) {
result_dims[MAX_NUM_SHAPE - 2] *= dims[i];
}
} else {
result_dims = dims;
}
return result_dims;
}
AclEltwiseExecutor::AclEltwiseExecutor(const ExecutorContext::CPtr context) : EltwiseExecutor(context) {}
bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vector<MemoryDescPtr> &srcDescs,
const std::vector<MemoryDescPtr> &dstDescs,
const std::vector<EltwisePostOp> &postOps) {
if (!postOps.empty()) { return false; }
aclEltwiseAttrs = eltwiseAttrs;
std::vector<arm_compute::TensorShape> srcVecDims(srcDescs.size()), dstVecDims(dstDescs.size());
std::vector<arm_compute::DataLayout> srcDataLayout(srcDescs.size()), dstDataLayout(dstDescs.size());
std::vector<arm_compute::TensorInfo> srcTensorsInfo(srcDescs.size()), dstTensorsInfo(dstDescs.size());
srcTensors = std::vector<arm_compute::Tensor>(srcDescs.size());
dstTensors = std::vector<arm_compute::Tensor>(dstDescs.size());
for (int i = 0; i < srcVecDims.size(); i++) {
srcVecDims[i] = shapeCast(reshape_sizes(srcDescs[i]->getShape().getDims()));
}
for (int i = 0; i < dstVecDims.size(); i++) {
dstVecDims[i] = shapeCast(reshape_sizes(dstDescs[i]->getShape().getDims()));
}
for (int i = 0; i < srcDescs.size(); i++) {
srcDataLayout[i] = getAclDataLayoutByMemoryDesc(srcDescs[i]);
if (srcDataLayout[i] == arm_compute::DataLayout::UNKNOWN) { return false; }
}
for (int i = 0; i < dstDescs.size(); i++) {
dstDataLayout[i] = getAclDataLayoutByMemoryDesc(dstDescs[i]);
if (dstDataLayout[i] == arm_compute::DataLayout::UNKNOWN) { return false; }
}
if (srcDescs.size() == 2 &&
srcDescs[0]->hasLayoutType(LayoutType::nspc) && srcDescs[1]->hasLayoutType(LayoutType::nspc) &&
srcDescs[0]->getShape().getDims() != srcDescs[1]->getShape().getDims()) {
auto dim_size = srcDescs[0]->getShape().getDims().size();
auto mover = [&dim_size](TensorShape &_shape) {
if (dim_size == 5) { std::swap(_shape[2], _shape[3]); }
std::swap(_shape[1], _shape[2]);
std::swap(_shape[0], _shape[1]);
};
if (dim_size < 5) {
srcDataLayout[0] = srcDataLayout[1] = dstDataLayout[0] = DataLayout::NCHW;
} else {
srcDataLayout[0] = srcDataLayout[1] = dstDataLayout[0] = DataLayout::NCDHW;
}
mover(srcVecDims[0]);
mover(srcVecDims[1]);
mover(dstVecDims[0]);
}
for (int i = 0; i < srcVecDims.size(); i++) {
srcTensorsInfo[i] = TensorInfo(srcVecDims[i], 1,
precisionToAclDataType(srcDescs[i]->getPrecision()),
srcDataLayout[i]);
srcTensors[i].allocator()->init(srcTensorsInfo[i]);
}
for (int i = 0; i < dstVecDims.size(); i++) {
dstTensorsInfo[i] = TensorInfo(dstVecDims[i], 1,
precisionToAclDataType(dstDescs[i]->getPrecision()),
dstDataLayout[i]);
dstTensors[i].allocator()->init(dstTensorsInfo[i]);
}
switch (aclEltwiseAttrs.algorithm) {
case Algorithm::EltwiseAdd:
if (!NEArithmeticAddition::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ConvertPolicy::SATURATE))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEArithmeticAddition>();
acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ConvertPolicy::SATURATE);
acl_op->run();
};
break;
case Algorithm::EltwiseMultiply:
if (!NEPixelWiseMultiplication::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0],
1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEPixelWiseMultiplication>();
acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
acl_op->run();
};
break;
case Algorithm::EltwiseSubtract:
if (!NEArithmeticSubtraction::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ConvertPolicy::SATURATE))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEArithmeticSubtraction>();
acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ConvertPolicy::SATURATE);
acl_op->run();
};
break;
case Algorithm::EltwiseDivide:
if (!NEElementwiseDivision::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEElementwiseDivision>();
acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]);
acl_op->run();
};
break;
case Algorithm::EltwiseMaximum:
if (!NEElementwiseMax::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEElementwiseMax>();
acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]);
acl_op->run();
};
break;
case Algorithm::EltwiseMinimum:
if (!NEElementwiseMin::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEElementwiseMin>();
acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]);
acl_op->run();
};
break;
case Algorithm::EltwiseSquaredDifference:
if (!NEElementwiseSquaredDiff::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEElementwiseSquaredDiff>();
acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]);
acl_op->run();
};
break;
case Algorithm::EltwisePowerDynamic:
if (!NEElementwisePower::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEElementwisePower>();
acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]);
acl_op->run();
};
break;
case Algorithm::EltwiseEqual:
if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::Equal))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEElementwiseComparison>();
acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Equal);
acl_op->run();
};
break;
case Algorithm::EltwiseNotEqual:
if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::NotEqual))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEElementwiseComparison>();
acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::NotEqual);
acl_op->run();
};
break;
case Algorithm::EltwiseGreater:
if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::Greater))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEElementwiseComparison>();
acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Greater);
acl_op->run();
};
break;
case Algorithm::EltwiseGreaterEqual:
if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::GreaterEqual))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEElementwiseComparison>();
acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::GreaterEqual);
acl_op->run();
};
break;
case Algorithm::EltwiseLess:
if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::Less))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEElementwiseComparison>();
acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Less);
acl_op->run();
};
break;
case Algorithm::EltwiseLessEqual:
if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::LessEqual))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEElementwiseComparison>();
acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::LessEqual);
acl_op->run();
};
break;
case Algorithm::EltwiseRelu:
if (aclEltwiseAttrs.alpha == 0) {
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
ActivationLayerInfo::ActivationFunction::RELU))
return false;
} else {
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha}))
return false;
}
exec_func = [this]{
auto acl_op = std::make_unique<NEActivationLayer>();
if (aclEltwiseAttrs.alpha == 0) {
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::RELU);
} else {
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha});
}
acl_op->run();
};
break;
case Algorithm::EltwiseGeluErf:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::GELU))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::GELU);
acl_op->run();
};
break;
case Algorithm::EltwiseElu:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha}))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], {ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha});
acl_op->run();
};
break;
case Algorithm::EltwiseTanh:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f}))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f});
acl_op->run();
};
break;
case Algorithm::EltwiseSigmoid:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::LOGISTIC))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::LOGISTIC);
acl_op->run();
};
break;
case Algorithm::EltwiseAbs:
if (!NEAbsLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEAbsLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0]);
acl_op->run();
};
break;
case Algorithm::EltwiseSqrt:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SQRT))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SQRT);
acl_op->run();
};
break;
case Algorithm::EltwiseSoftRelu:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU);
acl_op->run();
};
break;
case Algorithm::EltwiseExp:
if (!NEExpLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEExpLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0]);
acl_op->run();
};
break;
case Algorithm::EltwiseClamp:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha}))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha});
acl_op->run();
};
break;
case Algorithm::EltwiseSwish:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha}))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha});
acl_op->run();
};
break;
case Algorithm::EltwisePrelu:
if (!NEPReluLayer::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEPReluLayer>();
acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]);
acl_op->run();
};
break;
case Algorithm::EltwiseHswish:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH);
acl_op->run();
};
break;
case Algorithm::EltwiseLog:
if (!NELogLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
return false;
exec_func = [this]{
auto acl_op = std::make_unique<NELogLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0]);
acl_op->run();
};
break;
default:
IE_THROW() << "Unsupported operation type for ACL Eltwise executor: " << static_cast<int>(aclEltwiseAttrs.algorithm);
}
return true;
}
void AclEltwiseExecutor::exec(const std::vector<MemoryCPtr> &src, const std::vector<MemoryPtr> &dst,
const void *post_ops_data_) {
for (int i = 0; i < src.size(); i++) {
srcTensors[i].allocator()->import_memory(src[i]->GetPtr());
}
for (int i = 0; i < dst.size(); i++) {
dstTensors[i].allocator()->import_memory(dst[i]->GetPtr());
}
exec_func();
for (int i = 0; i < src.size(); i++) {
srcTensors[i].allocator()->free();
}
for (int i = 0; i < dst.size(); i++) {
dstTensors[i].allocator()->free();
}
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,110 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "../eltwise.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "acl_utils.hpp"
namespace ov {
namespace intel_cpu {
class AclEltwiseExecutor : public EltwiseExecutor {
public:
AclEltwiseExecutor(const ExecutorContext::CPtr context);
bool init(const EltwiseAttrs& eltwiseAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const std::vector<EltwisePostOp>& postOps) override;
void exec(const std::vector<MemoryCPtr>& src,
const std::vector<MemoryPtr>& dst,
const void *post_ops_data_) override;
impl_desc_type getImplType() const override {
return implType;
}
private:
EltwiseAttrs aclEltwiseAttrs{};
impl_desc_type implType = impl_desc_type::acl;
std::vector<arm_compute::Tensor> srcTensors, dstTensors;
std::function<void()> exec_func;
};
class AclEltwiseExecutorBuilder : public EltwiseExecutorBuilder {
public:
bool isSupported(const EltwiseAttrs& eltwiseAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs) const override {
switch (eltwiseAttrs.algorithm) {
case Algorithm::EltwiseAdd:
case Algorithm::EltwiseMultiply:
case Algorithm::EltwiseSubtract:
case Algorithm::EltwiseDivide:
case Algorithm::EltwiseMaximum:
case Algorithm::EltwiseMinimum:
case Algorithm::EltwiseSquaredDifference:
case Algorithm::EltwisePowerDynamic:
case Algorithm::EltwiseEqual:
case Algorithm::EltwiseNotEqual:
case Algorithm::EltwiseGreater:
case Algorithm::EltwiseGreaterEqual:
case Algorithm::EltwiseLess:
case Algorithm::EltwiseLessEqual:
case Algorithm::EltwiseRelu:
case Algorithm::EltwiseGeluErf:
case Algorithm::EltwiseElu:
case Algorithm::EltwiseTanh:
case Algorithm::EltwiseSigmoid:
case Algorithm::EltwiseAbs:
case Algorithm::EltwiseSqrt:
case Algorithm::EltwiseSoftRelu:
case Algorithm::EltwiseExp:
case Algorithm::EltwiseClamp:
case Algorithm::EltwiseSwish:
case Algorithm::EltwisePrelu:
case Algorithm::EltwiseHswish:
case Algorithm::EltwiseLog:
break;
default:
return false;
}
// ACL supports only U8 precision on output for comparison operations
if (one_of(eltwiseAttrs.algorithm, Algorithm::EltwiseEqual, Algorithm::EltwiseNotEqual, Algorithm::EltwiseGreater,
Algorithm::EltwiseGreaterEqual, Algorithm::EltwiseLess, Algorithm::EltwiseLessEqual)) {
if (dstDescs[0]->getPrecision() != InferenceEngine::Precision::U8) {
return false;
}
}
for (const auto &srcD : srcDescs) {
for (const auto &dstD : dstDescs) {
if ((srcD->getPrecision() != InferenceEngine::Precision::FP32 &&
srcD->getPrecision() != InferenceEngine::Precision::FP16) ||
srcD->getPrecision() != dstD->getPrecision())
return false;
}
}
for (int i = 0; i < srcDescs.size(); i++) {
if (getAclDataLayoutByMemoryDesc(srcDescs[i]) == arm_compute::DataLayout::UNKNOWN)
return false;
}
for (int i = 0; i < dstDescs.size(); i++) {
if (getAclDataLayoutByMemoryDesc(dstDescs[i]) == arm_compute::DataLayout::UNKNOWN)
return false;
}
return true;
}
EltwiseExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
return std::make_shared<AclEltwiseExecutor>(context);
}
};
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,185 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "acl_interpolate.hpp"
#include "acl_utils.hpp"
static arm_compute::TensorShape interpolateShapeCast(const ov::intel_cpu::VectorDims& dims) {
arm_compute::TensorShape tensorShape;
for (std::size_t i = 0; i < dims.size(); ++i) {
tensorShape.set(dims.size() - i - 1, dims[i], false);
}
if (tensorShape.num_dimensions() == 0) {
tensorShape.set(0, 1, false);
tensorShape.set_num_dimensions(1);
}
return tensorShape;
}
bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs &interpolateAttrs,
const std::vector <MemoryDescPtr> &srcDescs,
const std::vector <MemoryDescPtr> &dstDescs,
const dnnl::primitive_attr &attr) {
InterpolateExecutor::init(interpolateAttrs, srcDescs, dstDescs, attr);
aclInterpolateAttrs = interpolateAttrs;
auto& coord_mode = aclInterpolateAttrs.coordTransMode;
auto& inter_mode = aclInterpolateAttrs.mode;
acl_coord = arm_compute::SamplingPolicy::TOP_LEFT;
auto& out_shape = dstDescs[0]->getShape().getDims();
if ((coord_mode == InterpolateCoordTransMode::pytorch_half_pixel && out_shape[2] > 1 && out_shape[3] > 1) ||
coord_mode == InterpolateCoordTransMode::half_pixel) {
acl_coord = arm_compute::SamplingPolicy::CENTER;
}
switch (inter_mode) {
case InterpolateMode::linear:
case InterpolateMode::linear_onnx:
acl_policy = arm_compute::InterpolationPolicy::BILINEAR;
break;
case InterpolateMode::nearest:
acl_policy = arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR;
break;
default:
return false;
}
auto srcDims = srcDescs[0]->getShape().getStaticDims();
auto dstDims = dstDescs[0]->getShape().getStaticDims();
auto srcTensorInfo = arm_compute::TensorInfo(interpolateShapeCast(srcDims), 1,
precisionToAclDataType(srcDescs[0]->getPrecision()),
getAclDataLayoutByMemoryDesc(srcDescs[0]));
auto dstTensorInfo = arm_compute::TensorInfo(interpolateShapeCast(dstDims), 1,
precisionToAclDataType(dstDescs[0]->getPrecision()),
getAclDataLayoutByMemoryDesc(dstDescs[0]));
if (!arm_compute::NEScale::validate(&srcTensorInfo,
&dstTensorInfo,
arm_compute::ScaleKernelInfo(acl_policy,
arm_compute::BorderMode::REPLICATE,
arm_compute::PixelValue(),
acl_coord,
false,
coord_mode == InterpolateCoordTransMode::align_corners)))
return false;
srcTensor.allocator()->init(srcTensorInfo);
dstTensor.allocator()->init(dstTensorInfo);
acl_scale = std::make_unique<arm_compute::NEScale>();
acl_scale->configure(&srcTensor, &dstTensor, arm_compute::ScaleKernelInfo(acl_policy,
arm_compute::BorderMode::REPLICATE,
arm_compute::PixelValue(),
acl_coord,
false,
aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::align_corners));
return true;
}
void ov::intel_cpu::ACLInterpolateExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) {
auto in_ptr_ = padPreprocess(src, dst);
srcTensor.allocator()->import_memory(const_cast<void *>(reinterpret_cast<const void *>(in_ptr_)));
dstTensor.allocator()->import_memory(dst[0]->GetPtr());
acl_scale->run();
srcTensor.allocator()->free();
dstTensor.allocator()->free();
}
bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupportedConfiguration(
const ov::intel_cpu::InterpolateAttrs &interpolateAttrs, const std::vector<MemoryDescPtr> &srcDescs,
const std::vector<MemoryDescPtr> &dstDescs) {
auto& inp_shape = srcDescs[0]->getShape().getDims();
auto& out_shape = dstDescs[0]->getShape().getDims();
float scale_h = static_cast<float>(out_shape[2]) / inp_shape[2];
float scale_w = static_cast<float>(out_shape[3]) / inp_shape[3];
bool is_upsample = scale_h > 1 && scale_w > 1;
auto& coord_mode = interpolateAttrs.coordTransMode;
auto& nearest_mode = interpolateAttrs.nearestMode;
if (coord_mode == InterpolateCoordTransMode::asymmetric &&
nearest_mode == InterpolateNearestMode::floor) {
return is_upsample;
}
if (coord_mode == InterpolateCoordTransMode::align_corners &&
nearest_mode == InterpolateNearestMode::round_prefer_ceil) {
return true;
}
if (coord_mode == InterpolateCoordTransMode::half_pixel &&
(nearest_mode == InterpolateNearestMode::simple || nearest_mode == InterpolateNearestMode::round_prefer_ceil)) {
return false;
}
if (coord_mode == InterpolateCoordTransMode::asymmetric &&
(nearest_mode == InterpolateNearestMode::simple || nearest_mode == InterpolateNearestMode::floor)) {
return is_upsample;
}
if (is_upsample) {
bool int_factor = scale_h == static_cast<int>(scale_h) && scale_w == static_cast<int>(scale_w);
if (int_factor && coord_mode != InterpolateCoordTransMode::asymmetric &&
(nearest_mode == InterpolateNearestMode::round_prefer_ceil
|| nearest_mode == InterpolateNearestMode::round_prefer_floor)) {
return true;
}
} else if (scale_h < 1 && scale_w < 1) {
float down_scale_h = static_cast<float>(inp_shape[2]) / out_shape[2];
float down_scale_w = static_cast<float>(inp_shape[3]) / out_shape[3];
bool int_factor = down_scale_h == static_cast<int>(down_scale_h) && down_scale_w == static_cast<int>(down_scale_w);
if (int_factor && coord_mode != InterpolateCoordTransMode::align_corners &&
nearest_mode == InterpolateNearestMode::simple) {
return true;
}
if (int_factor && nearest_mode == InterpolateNearestMode::round_prefer_ceil &&
((out_shape[2] > 1 && out_shape[3] > 1) || coord_mode != InterpolateCoordTransMode::half_pixel)) {
return true;
}
}
return false;
}
bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupported(const ov::intel_cpu::InterpolateAttrs &interpolateAttrs,
const std::vector<MemoryDescPtr> &srcDescs,
const std::vector<MemoryDescPtr> &dstDescs) const {
if (srcDescs[0]->getShape().getDims().size() != 4) {
return false;
}
auto& pads_begin = interpolateAttrs.padBegin;
auto& pads_end = interpolateAttrs.padEnd;
if (!std::all_of(pads_begin.begin(), pads_begin.end(), [](int i){return i == 0;}) ||
!std::all_of(pads_end.begin(), pads_end.end(), [](int i){return i == 0;})) {
return false;
}
auto& nearest_mode = interpolateAttrs.nearestMode;
auto& coord_mode = interpolateAttrs.coordTransMode;
if (interpolateAttrs.antialias ||
coord_mode == InterpolateCoordTransMode::tf_half_pixel_for_nn ||
nearest_mode == InterpolateNearestMode::ceil) {
return false;
}
if (interpolateAttrs.mode == InterpolateMode::cubic) {
return false;
}
if (interpolateAttrs.mode == InterpolateMode::nearest &&
!isSupportedConfiguration(interpolateAttrs, srcDescs, dstDescs)) {
return false;
}
if (coord_mode == InterpolateCoordTransMode::pytorch_half_pixel) {
return false;
}
return true;
}

View File

@ -0,0 +1,52 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "../interpolate.hpp"
namespace ov {
namespace intel_cpu {
class ACLInterpolateExecutor : public InterpolateExecutor {
public:
ACLInterpolateExecutor(const ExecutorContext::CPtr context) : InterpolateExecutor(context) {}
bool init(const InterpolateAttrs& interpolateAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) override;
void exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) override;
impl_desc_type getImplType() const override {
return implType;
}
private:
impl_desc_type implType = impl_desc_type::acl;
InterpolateAttrs aclInterpolateAttrs;
arm_compute::SamplingPolicy acl_coord;
arm_compute::InterpolationPolicy acl_policy;
bool antialias{};
arm_compute::Tensor srcTensor, dstTensor;
std::unique_ptr<arm_compute::NEScale> acl_scale;
};
class ACLInterpolateExecutorBuilder : public InterpolateExecutorBuilder {
public:
bool isSupported(const InterpolateAttrs& interpolateAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs) const override;
InterpolateExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
return std::make_shared<ACLInterpolateExecutor>(context);
}
private:
static bool isSupportedConfiguration(const InterpolateAttrs& interpolateAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs);
};
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,76 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "acl_mvn.hpp"
namespace ov {
namespace intel_cpu {
using namespace arm_compute;
AclMVNExecutor::AclMVNExecutor(const ExecutorContext::CPtr context) : MVNExecutor(context) {}
bool AclMVNExecutor::init(const MVNAttrs& mvnAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) {
auto srcDims = srcDescs[0]->getShape().getStaticDims();
auto dstDims = dstDescs[0]->getShape().getStaticDims();
size_t X, Y;
if (mvnAttrs.initAcrossChannels_) {
if (srcDims.size() >= 2) {
Y = srcDims[0];
X = srcDims[1];
for (int i = 2; i < srcDims.size(); i++) {
X *= srcDims[i];
}
} else {
Y = srcDims[0];
X = 1;
}
} else {
if (srcDims.size() > 2) {
Y = srcDims[0] * srcDims[1];
X = srcDims[2];
for (int i = 3; i < srcDims.size(); i++) {
X *= srcDims[i];
}
} else if (srcDims.size() == 2) {
Y = srcDims[0] * srcDims[1];
X = 1;
} else {
Y = srcDims[0];
X = 1;
}
}
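// Illustrative note (not part of the original change): for an NCHW input {N, C, H, W},
// across-channels MVN collapses the tensor to a 2D view with Y = N and X = C * H * W,
// while per-channel MVN uses Y = N * C and X = H * W, matching the branches above.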
TensorInfo srcTensorInfo = TensorInfo(TensorShape(X, Y), 1, precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0]));
TensorInfo dstTensorInfo = TensorInfo(TensorShape(X, Y), 1, precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0]));
if (!arm_compute::NEMeanStdDevNormalizationLayer::validate(&srcTensorInfo, &dstTensorInfo, mvnAttrs.epsValue_))
return false;
srcTensor.allocator()->init(srcTensorInfo);
dstTensor.allocator()->init(dstTensorInfo);
mvn = std::make_unique<arm_compute::NEMeanStdDevNormalizationLayer>();
mvn->configure(&srcTensor, &dstTensor, mvnAttrs.epsValue_);
return true;
}
void AclMVNExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) {
srcTensor.allocator()->import_memory(src[0]->GetPtr());
dstTensor.allocator()->import_memory(dst[0]->GetPtr());
mvn->run();
srcTensor.allocator()->free();
dstTensor.allocator()->free();
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,73 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "acl_utils.hpp"
#include "nodes/executors/mvn.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"
namespace ov {
namespace intel_cpu {
class AclMVNExecutor : public MVNExecutor {
public:
AclMVNExecutor(const ExecutorContext::CPtr context);
bool init(const MVNAttrs& mvnAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) override;
void exec(const std::vector<MemoryCPtr>& src,
const std::vector<MemoryPtr>& dst,
const void *post_ops_data_) override;
impl_desc_type getImplType() const override {
return implType;
}
private:
impl_desc_type implType = impl_desc_type::acl;
arm_compute::Tensor srcTensor;
arm_compute::Tensor dstTensor;
std::unique_ptr<arm_compute::NEMeanStdDevNormalizationLayer> mvn = nullptr;
};
class AclMVNExecutorBuilder : public MVNExecutorBuilder {
public:
bool isSupported(const MVNAttrs& mvnAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs) const override {
if ((srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP32 &&
srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP16) ||
srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision())
return false;
if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
!(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
dstDescs[0]->hasLayoutType(LayoutType::nspc)))
return false;
if (mvnAttrs.epsMode_ == MVNEpsMode::OUTSIDE_SQRT) {
return false;
}
if (!mvnAttrs.normalizeVariance_) {
return false;
}
if (!mvnAttrs.initAcrossChannels_ && getAclDataLayoutByMemoryDesc(srcDescs[0]) == arm_compute::DataLayout::NHWC) {
return false;
}
return true;
}
MVNExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
return std::make_shared<AclMVNExecutor>(context);
}
};
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,183 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "acl_pooling.hpp"
#include "acl_utils.hpp"
namespace ov {
namespace intel_cpu {
using namespace arm_compute;
AclPoolingExecutor::AclPoolingExecutor(const ExecutorContext::CPtr context) : PoolingExecutor(context) {}
bool AclPoolingExecutor::isSupported(const TensorInfo& srcTensorInfo,
const TensorInfo& dstTensorInfo,
const PoolingAttrs& poolingAttrs,
size_t srcDimsSize,
size_t dstDescsSize,
DataLayout dataLayout,
const VectorDims* indDims,
PoolingLayerInfo* pool_info,
Pooling3dLayerInfo* pool3d_info) {
unsigned int pad_left = (poolingAttrs.data_pad_begin.size() >= 2) ? poolingAttrs.data_pad_begin[1] : poolingAttrs.data_pad_begin[0];
unsigned int pad_right = (poolingAttrs.data_pad_end.size() >= 2) ? poolingAttrs.data_pad_end[1] : poolingAttrs.data_pad_end[0];
unsigned int pad_top = (poolingAttrs.data_pad_begin.size() >= 2) ? poolingAttrs.data_pad_begin[0] : 0;
unsigned int pad_bottom = (poolingAttrs.data_pad_end.size() >= 2) ? poolingAttrs.data_pad_end[0] : 0;
unsigned int kernel_w = (poolingAttrs.kernel.size() >= 2) ? poolingAttrs.kernel[1] : poolingAttrs.kernel[0];
unsigned int kernel_h = (poolingAttrs.kernel.size() >= 2) ? poolingAttrs.kernel[0] : 1;
unsigned int stride_x = (poolingAttrs.stride.size() >= 2) ? poolingAttrs.stride[1] : poolingAttrs.stride[0];
unsigned int stride_y = (poolingAttrs.stride.size() >= 2) ? poolingAttrs.stride[0] : 1;
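// Illustrative note (not part of the original change): for a 2D pooling with kernel {3, 3},
// strides {2, 2}, pads_begin {1, 1} and pads_end {1, 1} this yields kernel_h = kernel_w = 3,
// stride_y = stride_x = 2 and pad_top = pad_left = pad_bottom = pad_right = 1; a 1D case reads
// the width-related values from element [0] and defaults the height-related ones
// (kernel_h = 1, stride_y = 1, pad_top = pad_bottom = 0).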
PoolingType pool_type;
bool exclude_padding = false;
if (poolingAttrs.algorithm == Algorithm::PoolingMax) {
pool_type = PoolingType::MAX;
exclude_padding = (poolingAttrs.pad_type != op::PadType::EXPLICIT);
} else if (poolingAttrs.algorithm == Algorithm::PoolingAvg) {
pool_type = PoolingType::AVG;
exclude_padding = poolingAttrs.exclude_pad;
} else {
DEBUG_LOG("Unknown pooling algorithm: ", static_cast<int>(poolingAttrs.algorithm));
return false;
}
DimensionRoundingType round = (poolingAttrs.rounding == op::RoundingType::CEIL) ?
DimensionRoundingType::CEIL : DimensionRoundingType::FLOOR;
if (srcDimsSize == 5) {
if (dstDescsSize > 1) {
DEBUG_LOG("NEPooling3dLayer does not support indices");
return false;
} else {
unsigned int kernel_d = poolingAttrs.kernel[2];
unsigned int stride_z = poolingAttrs.stride[2];
unsigned int pad_front = poolingAttrs.data_pad_begin[2];
unsigned int pad_back = poolingAttrs.data_pad_end[2];
pool3d_info->pool_type = pool_type;
pool3d_info->exclude_padding = exclude_padding;
pool3d_info->pool_size = arm_compute::Size3D(kernel_w, kernel_h, kernel_d);
pool3d_info->stride = arm_compute::Size3D(stride_x, stride_y, stride_z);
pool3d_info->padding = arm_compute::Padding3D(pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back);
pool3d_info->round_type = round;
arm_compute::Status s = arm_compute::NEPooling3dLayer::validate(&srcTensorInfo, &dstTensorInfo, *pool3d_info);
if (!s) {
DEBUG_LOG("NEPooling3dLayer validation failed: ", s.error_description());
return false;
}
}
} else {
pool_info->data_layout = dataLayout;
pool_info->pool_size = arm_compute::Size2D(kernel_w, kernel_h);
pool_info->pad_stride_info = arm_compute::PadStrideInfo(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, round);
pool_info->pool_type = pool_type;
pool_info->exclude_padding = exclude_padding;
if (dstDescsSize > 1) {
TensorInfo indTensorInfo = TensorInfo(shapeCast(*indDims), 1, arm_compute::DataType::U32, dataLayout);
arm_compute::Status s = arm_compute::NEPoolingLayer::validate(&srcTensorInfo, &dstTensorInfo, *pool_info, &indTensorInfo);
if (!s) {
DEBUG_LOG("NEPoolingLayer validation with indices failed: ", s.error_description());
return false;
}
} else {
arm_compute::Status s = arm_compute::NEPoolingLayer::validate(&srcTensorInfo, &dstTensorInfo, *pool_info);
if (!s) {
DEBUG_LOG("NEPoolingLayer validation without indices failed: ", s.error_description());
return false;
}
}
}
return true;
}
bool AclPoolingExecutor::init(const PoolingAttrs& poolingAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) {
auto srcDims = srcDescs[0]->getShape().getStaticDims();
auto dstDims = dstDescs[0]->getShape().getStaticDims();
TensorInfo srcTensorInfo = TensorInfo(shapeCast(srcDims), 1,
precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0]));
TensorInfo dstTensorInfo = TensorInfo(shapeCast(dstDims), 1,
precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0]));
srcTensor.allocator()->init(srcTensorInfo);
dstTensor.allocator()->init(dstTensorInfo);
if (srcDims.size() == 5) {
if (dstDescs.size() == 1) {
Pooling3dLayerInfo pool_info;
if (!isSupported(srcTensorInfo,
dstTensorInfo,
poolingAttrs,
srcDims.size(),
dstDescs.size(),
getAclDataLayoutByMemoryDesc(srcDescs[0]),
nullptr,
nullptr,
&pool_info))
return false;
exec_func = [this, pool_info]{
auto acl_op = std::make_unique<arm_compute::NEPooling3dLayer>();
acl_op->configure(&srcTensor, &dstTensor, pool_info);
acl_op->run();
};
}
} else {
arm_compute::PoolingLayerInfo pool_info;
if (dstDescs.size() > 1) {
if (!isSupported(srcTensorInfo,
dstTensorInfo,
poolingAttrs,
srcDims.size(),
dstDescs.size(),
getAclDataLayoutByMemoryDesc(srcDescs[0]),
&dstDescs[1]->getShape().getStaticDims(),
&pool_info,
nullptr))
return false;
auto indDims = dstDescs[1]->getShape().getStaticDims();
TensorInfo indTensorInfo = TensorInfo(shapeCast(indDims), 1, precisionToAclDataType(dstDescs[1]->getPrecision()),
getAclDataLayoutByMemoryDesc(dstDescs[1]));
indTensor.allocator()->init(indTensorInfo);
exec_func = [this, pool_info]{
auto acl_op = std::make_unique<arm_compute::NEPoolingLayer>();
acl_op->configure(&srcTensor, &dstTensor, pool_info, &indTensor);
acl_op->run();
};
} else {
if (!isSupported(srcTensorInfo,
dstTensorInfo,
poolingAttrs,
srcDims.size(),
dstDescs.size(),
getAclDataLayoutByMemoryDesc(srcDescs[0]),
nullptr,
&pool_info,
nullptr))
return false;
exec_func = [this, pool_info]{
auto acl_op = std::make_unique<arm_compute::NEPoolingLayer>();
acl_op->configure(&srcTensor, &dstTensor, pool_info);
acl_op->run();
};
}
}
return true;
}
void AclPoolingExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, std::unordered_map<int, MemoryPtr> postOpsArgs) {
srcTensor.allocator()->import_memory(src[0]->GetPtr());
dstTensor.allocator()->import_memory(dst[0]->GetPtr());
if (dst.size() > 1) indTensor.allocator()->import_memory(dst[1]->GetPtr());
exec_func();
srcTensor.allocator()->free();
dstTensor.allocator()->free();
if (dst.size() > 1) indTensor.allocator()->free();
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,131 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "nodes/executors/pooling.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "utils/debug_capabilities.h"
namespace ov {
namespace intel_cpu {
class AclPoolingExecutor : public PoolingExecutor {
public:
AclPoolingExecutor(const ExecutorContext::CPtr context);
bool init(const PoolingAttrs& poolingAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) override;
void exec(const std::vector<MemoryCPtr>& src,
const std::vector<MemoryPtr>& dst,
std::unordered_map<int, MemoryPtr> postOpsArgs) override;
static bool isSupported(const arm_compute::TensorInfo& srcTensorInfo,
const arm_compute::TensorInfo& dstTensorInfo,
const PoolingAttrs& poolingAttrs,
size_t srcDimsSize,
size_t dstDescsSize,
arm_compute::DataLayout dataLayout,
const VectorDims* indDims,
arm_compute::PoolingLayerInfo* pool_info,
arm_compute::Pooling3dLayerInfo* pool3d_info);
impl_desc_type getImplType() const override {
return implType;
}
private:
std::function<void()> exec_func;
PoolingAttrs poolingAttrs;
impl_desc_type implType = impl_desc_type::acl;
arm_compute::Tensor srcTensor;
arm_compute::Tensor dstTensor;
arm_compute::Tensor indTensor;
std::unique_ptr<arm_compute::NEPoolingLayer> pooling = nullptr;
};
class AclPoolingExecutorBuilder : public PoolingExecutorBuilder {
public:
bool isSupported(const PoolingAttrs& poolingAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs) const override {
if ((srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP32 &&
dstDescs[0]->getPrecision() != InferenceEngine::Precision::FP32) &&
(srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP16 &&
dstDescs[0]->getPrecision() != InferenceEngine::Precision::FP16)) {
DEBUG_LOG("AclPoolingExecutor does not support precisions:",
" src[0]=", srcDescs[0]->getPrecision(),
" dst[0]=", dstDescs[0]->getPrecision());
return false;
}
if (srcDescs.size() == 2 &&
(srcDescs[1]->getPrecision() != InferenceEngine::Precision::FP32 &&
srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP32 &&
dstDescs[0]->getPrecision() != InferenceEngine::Precision::FP32) &&
(srcDescs[1]->getPrecision() != InferenceEngine::Precision::FP16 &&
srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP16 &&
dstDescs[0]->getPrecision() != InferenceEngine::Precision::FP16)) {
DEBUG_LOG("AclPoolingExecutor does not support precisions:",
" src[0]=", srcDescs[0]->getPrecision(),
" src[1]=", srcDescs[1]->getPrecision(),
" dst[0]=", dstDescs[0]->getPrecision());
return false;
}
if (dstDescs.size() == 2 &&
dstDescs[1]->getPrecision() != InferenceEngine::Precision::U32) {
DEBUG_LOG("AclPoolingExecutor does not support precisions:",
" dst[1]=", dstDescs[1]->getPrecision());
return false;
}
if (srcDescs[0]->getShape().getRank() < 5) {
if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
!(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
DEBUG_LOG("NEPoolingLayer does not support layouts:",
" src=", srcDescs[0]->serializeFormat(),
" dst=", dstDescs[0]->serializeFormat());
return false;
}
if (srcDescs.size() == 2 &&
!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
srcDescs[1]->hasLayoutType(LayoutType::ncsp) &&
dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
!(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
srcDescs[1]->hasLayoutType(LayoutType::nspc) &&
dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
DEBUG_LOG("NEPoolingLayer does not support layouts:",
" src[0]=", srcDescs[0]->serializeFormat(),
" src[1]=", srcDescs[1]->serializeFormat(),
" dst=", dstDescs[0]->serializeFormat());
return false;
}
} else {
if (!(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
DEBUG_LOG("Pooling3dLayer does not support layouts:",
" src=", srcDescs[0]->serializeFormat(),
" dst=", dstDescs[0]->serializeFormat());
return false;
}
}
return true;
}
PoolingExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
return std::make_shared<AclPoolingExecutor>(context);
}
};
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,109 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "acl_utils.hpp"
#include "acl_reduce.hpp"
namespace ov {
namespace intel_cpu {
using namespace arm_compute;
static arm_compute::ReductionOperation getAclReductionOperationByAlgorithm(Algorithm algorithm) {
switch (algorithm) {
case Algorithm::ReduceMax: return arm_compute::ReductionOperation::MAX;
case Algorithm::ReduceMin: return arm_compute::ReductionOperation::MIN;
case Algorithm::ReduceSum: return arm_compute::ReductionOperation::SUM;
case Algorithm::ReduceProd: return arm_compute::ReductionOperation::PROD;
default: IE_THROW() << "Unsupported reduction operation: " << static_cast<int>(algorithm);
}
}
AclReduceExecutor::AclReduceExecutor(const ExecutorContext::CPtr context) : ReduceExecutor(context) {}
bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) {
if (reduceAttrs.operation != Algorithm::ReduceMax &&
reduceAttrs.operation != Algorithm::ReduceMin &&
reduceAttrs.operation != Algorithm::ReduceSum &&
reduceAttrs.operation != Algorithm::ReduceProd &&
reduceAttrs.operation != Algorithm::ReduceMean) {
DEBUG_LOG("Unknown reduce algorithm passed into AclReduceExecutor: ", static_cast<int>(reduceAttrs.operation));
return false;
}
this->reduceAttrs = reduceAttrs;
auto srcDims = srcDescs[0]->getShape().getStaticDims();
auto dstDims = dstDescs[0]->getShape().getStaticDims();
TensorInfo srcTensorInfo = TensorInfo(shapeCast(srcDims), 1,
precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0]));
TensorInfo dstTensorInfo = TensorInfo(shapeCast(dstDims), 1,
precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0]));
srcTensor.allocator()->init(srcTensorInfo);
dstTensor.allocator()->init(dstTensorInfo);
switch (reduceAttrs.operation) {
case Algorithm::ReduceMean: {
for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) {
auto axe = axisCast(reduceAttrs.axes[i], srcDims.size());
auto pos = axisCast(i, reduceAttrs.axes.size());
axesMean.set(pos, axe);
}
Status reduceMeanStatus = NEReduceMean::validate(&srcTensorInfo, axesMean, reduceAttrs.keepDims, &dstTensorInfo);
if (!reduceMeanStatus) {
DEBUG_LOG("NEReduceMean validation failed: ", reduceMeanStatus.error_description());
return false;
}
exec_func = [this]{
auto acl_op = std::make_unique<arm_compute::NEReduceMean>();
acl_op->configure(&srcTensor, axesMean, this->reduceAttrs.keepDims, &dstTensor);
acl_op->run();
};
break;
}
case Algorithm::ReduceMax:
case Algorithm::ReduceMin:
case Algorithm::ReduceSum:
case Algorithm::ReduceProd: {
if (reduceAttrs.axes.size() != 1) {
return false;
}
Status reductionOperationStatus = NEReductionOperation::validate(&srcTensorInfo, &dstTensorInfo, axisCast(reduceAttrs.axes[0], srcDims.size()),
getAclReductionOperationByAlgorithm(reduceAttrs.operation), reduceAttrs.keepDims);
if (!reductionOperationStatus) {
DEBUG_LOG("NEReductionOperation validation with indices failed: ", reductionOperationStatus.error_description());
return false;
}
exec_func = [this, srcDims]{
auto acl_op = std::make_unique<arm_compute::NEReductionOperation>();
acl_op->configure(&srcTensor, &dstTensor, axisCast(this->reduceAttrs.axes[0], srcDims.size()),
getAclReductionOperationByAlgorithm(this->reduceAttrs.operation), this->reduceAttrs.keepDims);
acl_op->run();
};
break;
}
default:
IE_THROW() << "Unsupported operation type for ACL Reduce executor: " << static_cast<int>(reduceAttrs.operation);
}
return true;
}
void AclReduceExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) {
srcTensor.allocator()->import_memory(src[0]->GetPtr());
dstTensor.allocator()->import_memory(dst[0]->GetPtr());
exec_func();
srcTensor.allocator()->free();
dstTensor.allocator()->free();
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,75 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
// TODO: remove relative path
#include "../reduce.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "utils/debug_capabilities.h"
namespace ov {
namespace intel_cpu {
class AclReduceExecutor : public ReduceExecutor {
public:
AclReduceExecutor(const ExecutorContext::CPtr context);
bool init(const ReduceAttrs& reduceAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) override;
void exec(const std::vector<MemoryCPtr>& src,
const std::vector<MemoryPtr>& dst,
const void *post_ops_data_) override;
impl_desc_type getImplType() const override {
return implType;
}
private:
std::function<void()> exec_func;
ReduceAttrs reduceAttrs;
impl_desc_type implType = impl_desc_type::acl;
arm_compute::Coordinates axesMean;
arm_compute::Tensor srcTensor;
arm_compute::Tensor dstTensor;
};
class AclReduceExecutorBuilder : public ReduceExecutorBuilder {
public:
bool isSupported(const ReduceAttrs& reduceAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs) const override {
if (reduceAttrs.operation == Algorithm::ReduceMean) {
if (srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision() ||
(srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP32 &&
srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP16)) {
DEBUG_LOG("NEReduceMean does not support precisions:",
" src[0]=", srcDescs[0]->getPrecision(),
" dst[0]=", dstDescs[0]->getPrecision());
return false;
}
} else {
if (srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision() ||
(srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP32 &&
srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP16 &&
srcDescs[0]->getPrecision() != InferenceEngine::Precision::I32)) {
DEBUG_LOG("NEReductionOperation does not support precisions:",
" src[0]=", srcDescs[0]->getPrecision(),
" dst[0]=", dstDescs[0]->getPrecision());
return false;
}
}
return true;
}
ReduceExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
return std::make_shared<AclReduceExecutor>(context);
}
};
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,81 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "ie_precision.hpp"
#include "memory_desc/cpu_memory_desc.h"
#include "arm_compute/core/Types.h"
namespace ov {
namespace intel_cpu {
/**
* @brief Return ComputeLibrary TensorShape with the dimension order reversed, as expected by ACL
* @param dims vector of dimensions to convert
* @return ComputeLibrary TensorShape object
*/
inline arm_compute::TensorShape shapeCast(const VectorDims& dims) {
arm_compute::TensorShape tensorShape;
for (std::size_t i = 0; i < dims.size(); ++i) {
tensorShape.set(dims.size() - i - 1, dims[i], false);
}
if (tensorShape.num_dimensions() == 0) {
tensorShape.set(0, 1, false);
tensorShape.set_num_dimensions(1);
}
return tensorShape;
}
inline std::size_t axisCast(const std::size_t axis, const std::size_t shapeSize) {
return shapeSize - axis - 1;
}
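// Illustrative note (not part of the original change): shapeCast({1, 3, 224, 224}) produces
// TensorShape(224, 224, 3, 1), since ACL stores the innermost dimension first, and
// axisCast(1, 4) maps OpenVINO axis 1 of a 4D shape to ACL axis 2.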
inline Dim vectorProduct(const VectorDims& vec, size_t size) {
Dim prod = 1;
for (size_t i = 0; i < size; ++i)
prod *= vec[i];
return prod;
}
/**
* @brief Return ComputeLibrary DataType that corresponds to the given precision
* @param precision precision to be converted
* @return ComputeLibrary DataType or UNKNOWN if precision is not mapped to DataType
*/
inline arm_compute::DataType precisionToAclDataType(InferenceEngine::Precision precision) {
switch (precision) {
case InferenceEngine::Precision::I8: return arm_compute::DataType::S8;
case InferenceEngine::Precision::U8: return arm_compute::DataType::U8;
case InferenceEngine::Precision::I16: return arm_compute::DataType::S16;
case InferenceEngine::Precision::U16: return arm_compute::DataType::U16;
case InferenceEngine::Precision::I32: return arm_compute::DataType::S32;
case InferenceEngine::Precision::U32: return arm_compute::DataType::U32;
case InferenceEngine::Precision::FP16: return arm_compute::DataType::F16;
case InferenceEngine::Precision::FP32: return arm_compute::DataType::F32;
case InferenceEngine::Precision::FP64: return arm_compute::DataType::F64;
case InferenceEngine::Precision::I64: return arm_compute::DataType::S64;
case InferenceEngine::Precision::BF16: return arm_compute::DataType::BFLOAT16;
default: return arm_compute::DataType::UNKNOWN;
}
}
/**
* @brief Return ComputeLibrary DataLayout that corresponds to MemoryDesc layout
* @param desc MemoryDesc from which layout is retrieved
* @return ComputeLibrary DataLayout or UNKNOWN if MemoryDesc layout is not mapped to DataLayout
*/
inline arm_compute::DataLayout getAclDataLayoutByMemoryDesc(MemoryDescCPtr desc) {
if (desc->hasLayoutType(LayoutType::ncsp)) {
if (desc->getShape().getRank() <= 4) return arm_compute::DataLayout::NCHW;
if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NCDHW;
} else if (desc->hasLayoutType(LayoutType::nspc)) {
if (desc->getShape().getRank() <= 4) return arm_compute::DataLayout::NHWC;
if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NDHWC;
}
return arm_compute::DataLayout::UNKNOWN;
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,15 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "eltwise.hpp"
namespace ov {
namespace intel_cpu {
using namespace InferenceEngine;
EltwiseExecutor::EltwiseExecutor(const ExecutorContext::CPtr context) : context(context) {}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,109 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "cpu_memory.h"
#include "onednn/iml_type_mapper.h"
#include "executor.hpp"
namespace ov {
namespace intel_cpu {
struct EltwiseAttrs {
Algorithm algorithm;
float alpha;
float beta;
float gamma;
EltwiseAttrs() : algorithm(Algorithm::Default), alpha(0), beta(0), gamma(0) {}
EltwiseAttrs(Algorithm algorithm, float alpha, float beta, float gamma) : algorithm(algorithm), alpha(alpha), beta(beta), gamma(gamma) {}
bool operator==(const EltwiseAttrs& rhs) const {
bool retVal = true;
retVal = algorithm == rhs.algorithm &&
alpha == rhs.alpha &&
beta == rhs.beta &&
gamma == rhs.gamma;
return retVal;
}
};
enum class EltwisePostOpType {
Undefined,
Eltwise,
Dnnl
};
class EltwisePostOp {
public:
EltwisePostOp(EltwiseAttrs eltwise) {
type = EltwisePostOpType::Eltwise;
this->eltwise = eltwise;
}
EltwisePostOp(dnnl::post_ops dnnlPostOps) {
type = EltwisePostOpType::Dnnl;
this->dnnlPostOps = dnnlPostOps;
}
~EltwisePostOp() = default;
EltwiseAttrs eltwise;
dnnl::post_ops dnnlPostOps;
EltwisePostOpType type = EltwisePostOpType::Undefined;
bool operator==(const EltwisePostOp &rhs) const {
if (type != rhs.type) { return false; }
bool ret = true;
switch (type) {
case EltwisePostOpType::Eltwise:
ret = eltwise == rhs.eltwise;
break;
case EltwisePostOpType::Dnnl:
ret = dnnlPostOps == rhs.dnnlPostOps;
break;
default: assert(!"unsupported eltwise post operation type");
}
return ret;
}
};
class EltwiseExecutor {
public:
EltwiseExecutor(const ExecutorContext::CPtr context);
virtual bool init(const EltwiseAttrs& eltwiseAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const std::vector<EltwisePostOp>& postOps) = 0;
virtual void exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) = 0;
virtual ~EltwiseExecutor() = default;
virtual impl_desc_type getImplType() const = 0;
protected:
EltwiseAttrs eltwiseAttrs;
const ExecutorContext::CPtr context;
};
using EltwiseExecutorPtr = std::shared_ptr<EltwiseExecutor>;
using EltwiseExecutorCPtr = std::shared_ptr<const EltwiseExecutor>;
class EltwiseExecutorBuilder {
public:
virtual ~EltwiseExecutorBuilder() = default;
virtual bool isSupported(const EltwiseAttrs& eltwiseAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs) const = 0;
virtual EltwiseExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0;
};
using EltwiseExecutorBuilderPtr = std::shared_ptr<EltwiseExecutorBuilder>;
using EltwiseExecutorBuilderCPtr = std::shared_ptr<const EltwiseExecutorBuilder>;
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,19 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "eltwise_list.hpp"
namespace ov {
namespace intel_cpu {
const std::vector<EltwiseExecutorDesc>& getEltwiseExecutorsList() {
static std::vector<EltwiseExecutorDesc> descs = {
OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared<AclEltwiseExecutorBuilder>())
};
return descs;
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,84 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "executor.hpp"
#include "eltwise.hpp"
#if defined(OV_CPU_WITH_ACL)
#include "acl/acl_eltwise.hpp"
#endif
#include "onednn/iml_type_mapper.h"
#include "common/primitive_cache.hpp"
namespace ov {
namespace intel_cpu {
struct EltwiseExecutorDesc {
ExecutorType executorType;
EltwiseExecutorBuilderCPtr builder;
};
const std::vector<EltwiseExecutorDesc>& getEltwiseExecutorsList();
class EltwiseExecutorFactory : public ExecutorFactory {
public:
EltwiseExecutorFactory(const EltwiseAttrs& eltwiseAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const ExecutorContext::CPtr context) : ExecutorFactory(context) {
for (auto& desc : getEltwiseExecutorsList()) {
if (desc.builder->isSupported(eltwiseAttrs, srcDescs, dstDescs)) {
supportedDescs.push_back(desc);
}
}
}
~EltwiseExecutorFactory() = default;
virtual EltwiseExecutorPtr makeExecutor(const EltwiseAttrs& eltwiseAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const std::vector<EltwisePostOp>& postOps) {
auto build = [&](const EltwiseExecutorDesc* desc) {
auto executor = desc->builder->makeExecutor(context);
if (executor->init(eltwiseAttrs, srcDescs, dstDescs, postOps)) {
return executor;
}
EltwiseExecutorPtr ptr = nullptr;
return ptr;
};
if (chosenDesc) {
if (auto executor = build(chosenDesc)) {
return executor;
}
}
for (const auto& sd : supportedDescs) {
if (auto executor = build(&sd)) {
chosenDesc = &sd;
return executor;
}
}
IE_THROW() << "Supported Eltwise executor is not found";
}
bool isEmpty() {
return supportedDescs.empty();
}
private:
std::vector<EltwiseExecutorDesc> supportedDescs;
const EltwiseExecutorDesc* chosenDesc = nullptr;
};
using EltwiseExecutorFactoryPtr = std::shared_ptr<EltwiseExecutorFactory>;
using EltwiseExecutorFactoryCPtr = std::shared_ptr<const EltwiseExecutorFactory>;
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,95 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "cache/multi_cache.h"
#include "graph_context.h"
#include "onednn/iml_type_mapper.h"
namespace ov {
namespace intel_cpu {
#if defined(OV_CPU_WITH_ACL)
#define OV_CPU_INSTANCE_ACL(...) \
{__VA_ARGS__},
#else
#define OV_CPU_INSTANCE_ACL(...)
#endif
#if defined(OV_CPU_WITH_DNNL)
#define OV_CPU_INSTANCE_DNNL(...) \
{__VA_ARGS__},
#else
#define OV_CPU_INSTANCE_DNNL(...)
#endif
#if defined(OPENVINO_ARCH_X86_64)
#define OV_CPU_INSTANCE_X64(...) \
{__VA_ARGS__},
#else
#define OV_CPU_INSTANCE_X64(...)
#endif
#define OV_CPU_INSTANCE_COMMON(...) \
{__VA_ARGS__},
enum class ExecutorType {
Undefined,
Common,
x64,
Dnnl,
Acl
};
class ExecutorContext {
public:
typedef std::shared_ptr<ExecutorContext> Ptr;
typedef std::shared_ptr<const ExecutorContext> CPtr;
ExecutorContext(const GraphContext::CPtr graphContext, const std::vector<impl_desc_type>& implPriorities) {
this->runtimeCache = graphContext->getParamsCache();
this->scratchPad = graphContext->getScratchPad();
this->engine = graphContext->getEngine();
this->implPriorities = implPriorities;
}
MultiCacheWeakPtr getRuntimeCache() const {
return runtimeCache;
}
DnnlScratchPadPtr getScratchPad() const {
return scratchPad;
}
dnnl::engine getEngine() const {
return engine;
}
const std::vector<impl_desc_type>& getImplPriorities() const {
return implPriorities;
}
private:
// weak_ptr is required to avoid cyclic dependencies with MultiCache
// since ExecutorContext is stored in Executor itself
MultiCacheWeakPtr runtimeCache;
DnnlScratchPadPtr scratchPad = nullptr;
dnnl::engine engine;
std::vector<impl_desc_type> implPriorities = {};
};
class ExecutorFactory {
public:
ExecutorFactory(const ExecutorContext::CPtr context) : context(context) {}
virtual ~ExecutorFactory() = default;
const ExecutorContext::CPtr context;
};
using ExecutorFactoryPtr = std::shared_ptr<ExecutorFactory>;
using ExecutorFactoryCPtr = std::shared_ptr<const ExecutorFactory>;
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,528 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "interpolate.hpp"
#include "ie_parallel.hpp"
#include "nodes/common/cpu_memcpy.h"
#include "emitters/x64/jit_load_store_emitters.hpp"
bool ov::intel_cpu::InterpolateExecutor::init(const InterpolateAttrs& interpolateAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) {
const auto &srcDims = srcDescs[0]->getShape().getStaticDims();
const auto &dstDims = dstDescs[0]->getShape().getStaticDims();
interpAttrs = interpolateAttrs;
srcDimPad5d = to5Dim(getPaddedInputShape(srcDims, interpolateAttrs.padBegin, interpolateAttrs.padEnd));
dstDim5d = to5Dim(dstDims);
srcDataSize = interpolateAttrs.inPrc.size();
dstDataSize = interpolateAttrs.outPrc.size();
dataRank = srcDims.size();
spatialDimSize = getSpatialDimsNum(dataRank);
switch (interpAttrs.mode) {
case InterpolateMode::nearest: {
buildTblNN(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.layout, interpolateAttrs.nearestMode);
break;
}
case InterpolateMode::linear_onnx: {
buildTblLinearOnnx(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.layout);
break;
}
case InterpolateMode::linear: {
static constexpr int LINEAR_KERNEL = 2;
buildTblLinear(srcDimPad5d, dstDim5d, interpAttrs.dataScales, LINEAR_KERNEL, interpolateAttrs.antialias);
break;
}
case InterpolateMode::cubic: {
buildTblCubic(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.cubeCoeff, interpolateAttrs.layout);
break;
}
default: {
IE_THROW() << "Interpolate executor does not support interpolate mode: " << interpAttrs.mode;
break;
}
}
return true;
}
// =====================================================================================================================
// index layout:
// d_0............d_OD-1, h_0..............h_OH-1, w_0................w_OW-1
void ov::intel_cpu::InterpolateExecutor::buildTblNN(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
const std::vector<float>& dataScales, InterpolateLayoutType layout, InterpolateNearestMode nearestMode) {
const int dimSize = dataRank;
float fz = (dimSize == 5) ? dataScales[dimSize - 3] : 1.f;
float fy = dataScales[dimSize - 2];
float fx = dataScales[dimSize - 1];
size_t ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4];
size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4];
indexTable.resize(OD + OH + OW);
bool isDDownsample = fz < 1;
bool isHDownsample = fy < 1;
bool isWDownsample = fx < 1;
for (int oz = 0; oz < OD; oz++) {
float iz = coordTransToInput(oz, fz, ID, OD);
indexTable[oz] = nearestRound(iz, isDDownsample, nearestMode);
indexTable[oz] = clipCoord(indexTable[oz], ID);
}
for (int oy = 0; oy < OH; oy++) {
float iy = coordTransToInput(oy, fy, IH, OH);
indexTable[OD + oy] = nearestRound(iy, isHDownsample, nearestMode);
indexTable[OD + oy] = clipCoord(indexTable[OD + oy], IH);
}
for (int ox = 0; ox < OW; ox++) {
float ix = coordTransToInput(ox, fx, IW, OW);
indexTable[OD + OH + ox] = nearestRound(ix, isWDownsample, nearestMode);
indexTable[OD + OH + ox] = clipCoord(indexTable[OD + OH + ox], IW);
}
}
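// Illustrative example (a sketch, not spelled out in the code above): upsampling W from 2 to 4
// (fx = 2.0) with half_pixel + round_prefer_floor maps ox = 0..3 to input coordinates
// -0.25, 0.25, 0.75, 1.25, which round and clip to the index table {0, 0, 1, 1}.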
// scale is float(outShape) / float(inShape)
// strictly consistent with the ONNX calculation manner (divide by scale, not multiply by its inverse), since this is done offline
// a slight precision difference can produce an obviously wrong value due to the "nearest round" behavior of NN mode
float ov::intel_cpu::InterpolateExecutor::coordTransToInput(int outCoord, float scale, int inShape, int outShape) const {
if (scale == 1.0f || (inShape == outShape)) {
return outCoord;
}
switch (interpAttrs.coordTransMode) {
case InterpolateCoordTransMode::half_pixel: {
return (outCoord + 0.5f) / scale - 0.5f;
break;
}
case InterpolateCoordTransMode::pytorch_half_pixel: {
if (outShape > 1)
return (outCoord + 0.5f) / scale - 0.5f;
else
return 0;
break;
}
case InterpolateCoordTransMode::asymmetric: {
return static_cast<float>(outCoord) / scale;
break;
}
case InterpolateCoordTransMode::tf_half_pixel_for_nn: {
return (outCoord + 0.5f) / scale;
break;
}
case InterpolateCoordTransMode::align_corners: {
if (outShape > 1)
return outCoord * (static_cast<float>(inShape - 1) / static_cast<float>(outShape - 1));
else
return 0;
break;
}
default: {
IE_THROW() << "errorPrefix" << " does not support specified coordinate transformation mode";
break;
}
}
}
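// Worked example (illustrative, assuming inShape = 4, outShape = 8, i.e. scale = 2.0, and outCoord = 2):
//   half_pixel           -> (2 + 0.5) / 2 - 0.5 = 0.75
//   asymmetric           -> 2 / 2               = 1.0
//   tf_half_pixel_for_nn -> (2 + 0.5) / 2       = 1.25
//   align_corners        -> 2 * (4 - 1) / (8 - 1) ~ 0.857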
int ov::intel_cpu::InterpolateExecutor::nearestRound(float originCoord, bool isDownsample, InterpolateNearestMode nearestMode) const {
switch (nearestMode) {
case InterpolateNearestMode::round_prefer_floor: {
if (originCoord == (static_cast<int>(originCoord) + 0.5f))
return static_cast<int>(std::floor(originCoord));
else
return static_cast<int>(std::round(originCoord));
break;
}
case InterpolateNearestMode::round_prefer_ceil: {
return static_cast<int>(std::round(originCoord));
break;
}
case InterpolateNearestMode::floor: {
return static_cast<int>(std::floor(originCoord));
break;
}
case InterpolateNearestMode::ceil: {
return static_cast<int>(std::ceil(originCoord));
break;
}
case InterpolateNearestMode::simple: {
if (isDownsample)
return static_cast<int>(std::ceil(originCoord));
else
return static_cast<int>(originCoord);
}
default: {
IE_THROW() << "errorPrefix" << " does not support specified nearest round mode";
break;
}
}
}
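// Worked example (illustrative): for originCoord = 2.5
//   round_prefer_floor -> 2 (halfway values are floored)
//   round_prefer_ceil  -> 3 (std::round rounds halfway values away from zero)
//   floor -> 2, ceil -> 3
//   simple -> 3 when downsampling (ceil), 2 otherwise (truncation)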
void ov::intel_cpu::InterpolateExecutor::linearOnnxCF(int outCoord, float scale, int inShape, int outShape,
int& index0, int& index1, float& weight0, float& weight1) {
float inCoord = coordTransToInput(outCoord, scale, inShape, outShape);
inCoord = std::max(0.0f, std::min(inCoord, static_cast<float>(inShape - 1)));
index0 = std::min(static_cast<int>(inCoord), inShape - 1);
index1 = std::min(index0 + 1, inShape - 1);
weight1 = std::fabs(inCoord - index0);
weight0 = std::fabs(inCoord - index1);
if (index0 == index1) {
weight0 = 0.5f;
weight1 = 0.5f;
}
}
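// Worked example (illustrative): upsampling from inShape = 2 to outShape = 4 (scale = 2.0)
// with half_pixel, outCoord = 1 gives inCoord = 0.25, so index0 = 0, index1 = 1,
// weight0 = 0.75, weight1 = 0.25, i.e. out[1] = 0.75 * in[0] + 0.25 * in[1].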
void ov::intel_cpu::InterpolateExecutor::buildTblLinearOnnx(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
const std::vector<float>& dataScales, InterpolateLayoutType layout) {
int dimSize = dataRank;
float fz = (spatialDimSize > 2) ? dataScales[dimSize - 3] : 1.f;
float fy = (spatialDimSize > 1) ? dataScales[dimSize - 2] : 1.f;
float fx = dataScales[dimSize - 1];
int ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4];
int OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4];
std::vector<int*> indexPtr(MAX_INPUT_INTERPOLATE, nullptr);
std::vector<float*> weightPtr(MAX_INPUT_INTERPOLATE, nullptr);
if (layout == InterpolateLayoutType::planar) {
// FrontTopLeft:0, FrontTopRight:1, FrontBottomLeft:2, FrontBottomRight:3,
// EndTopLeft:4, EndTopRight:5, EndBottomLeft:6, EndBottomRight:7
// weight: left:0, right:1, top:2, bottom:3, front:4, end:5
int eltInGrid = (spatialDimSize > 2) ? MAX_INPUT_INTERPOLATE : ((spatialDimSize > 1) ? 4 : 2);
int idxType = 2;
int scratchLen = rnd_up(eltInGrid * OW * OH * OD, 16);
indexTable.resize(idxType * scratchLen);
indexPtr[0] = static_cast<int*>(&indexTable[0]);
indexPtr[1] = static_cast<int*>(&indexTable[OW * OH * OD]);
weightPtr[0] = reinterpret_cast<float*>(&indexTable[scratchLen]);
weightPtr[1] = reinterpret_cast<float*>(&indexTable[scratchLen + OW * OH * OD]);
if (spatialDimSize > 1) {
indexPtr[2] = static_cast<int*>(&indexTable[2 * OW * OH * OD]);
indexPtr[3] = static_cast<int*>(&indexTable[3 * OW * OH * OD]);
weightPtr[2] = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW * OH * OD]);
weightPtr[3] = reinterpret_cast<float*>(&indexTable[scratchLen + 3 * OW * OH * OD]);
}
if (spatialDimSize > 2) {
indexPtr[4] = static_cast<int*>(&indexTable[4 * OW * OH * OD]);
indexPtr[5] = static_cast<int*>(&indexTable[5 * OW * OH * OD]);
indexPtr[6] = static_cast<int*>(&indexTable[6 * OW * OH * OD]);
indexPtr[7] = static_cast<int*>(&indexTable[7 * OW * OH * OD]);
weightPtr[4] = reinterpret_cast<float*>(&indexTable[scratchLen + 4 * OW * OH * OD]);
weightPtr[5] = reinterpret_cast<float*>(&indexTable[scratchLen + 5 * OW * OH * OD]);
}
int scale = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::sse41) ? srcDataSize : 1;
for (int oz = 0; oz < OD; oz++) {
int izF, izE;
float weightF, weightE;
linearOnnxCF(oz, fz, ID, OD, izF, izE, weightF, weightE);
int idxOz = oz * OH * OW;
for (int oy = 0; oy < OH; oy++) {
int iyT, iyB;
float weightT, weightB;
linearOnnxCF(oy, fy, IH, OH, iyT, iyB, weightT, weightB);
int idxOzOy = idxOz + oy * OW;
for (int ox = 0; ox < OW; ox++) {
int ixL, ixR;
float weightL, weightR;
linearOnnxCF(ox, fx, IW, OW, ixL, ixR, weightL, weightR);
int idxOzOyOx = idxOzOy + ox;
indexPtr[0][idxOzOyOx] = (izF * IH * IW + iyT * IW + ixL) * scale;
indexPtr[1][idxOzOyOx] = (izF * IH * IW + iyT * IW + ixR) * scale;
weightPtr[0][idxOzOyOx] = weightL;
weightPtr[1][idxOzOyOx] = weightR;
if (spatialDimSize > 1) {
indexPtr[2][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixL) * scale;
indexPtr[3][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixR) * scale;
weightPtr[2][idxOzOyOx] = weightT;
weightPtr[3][idxOzOyOx] = weightB;
}
if (spatialDimSize > 2) {
indexPtr[4][idxOzOyOx] = (izE * IH * IW + iyT * IW + ixL) * scale;
indexPtr[5][idxOzOyOx] = (izE * IH * IW + iyT * IW + ixR) * scale;
indexPtr[6][idxOzOyOx] = (izE * IH * IW + iyB * IW + ixL) * scale;
indexPtr[7][idxOzOyOx] = (izE * IH * IW + iyB * IW + ixR) * scale;
weightPtr[4][idxOzOyOx] = weightF;
weightPtr[5][idxOzOyOx] = weightE;
}
}
}
}
} else {
// index: left:OW, right:OW, top:OH, bottom:OH, front:OD, end:OD
// weight: same layout as index
size_t scratchLen = rnd_up(OW + OW + OH + OH + OD + OD, 16);
int idxType = 2;
indexTable.resize(idxType * scratchLen);
indexPtr[0] = static_cast<int*>(&indexTable[0]);
indexPtr[1] = static_cast<int*>(&indexTable[OW]);
indexPtr[2] = static_cast<int*>(&indexTable[2 * OW]);
indexPtr[3] = static_cast<int*>(&indexTable[2 * OW + OH]);
indexPtr[4] = static_cast<int*>(&indexTable[2 * OW + 2 * OH]);
indexPtr[5] = static_cast<int*>(&indexTable[2 * OW + 2 * OH + OD]);
weightPtr[0] = reinterpret_cast<float*>(&indexTable[scratchLen]);
weightPtr[1] = reinterpret_cast<float*>(&indexTable[scratchLen + OW]);
weightPtr[2] = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW]);
weightPtr[3] = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW + OH]);
weightPtr[4] = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW + 2 * OH]);
weightPtr[5] = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW + 2 * OH + OD]);
for (int ox = 0; ox < OW; ox++) {
linearOnnxCF(ox, fx, IW, OW, indexPtr[0][ox], indexPtr[1][ox], weightPtr[0][ox], weightPtr[1][ox]);
}
for (int oy = 0; oy < OH; oy++) {
linearOnnxCF(oy, fy, IH, OH, indexPtr[2][oy], indexPtr[3][oy], weightPtr[2][oy], weightPtr[3][oy]);
}
for (int oz = 0; oz < OD; oz++) {
linearOnnxCF(oz, fz, ID, OD, indexPtr[4][oz], indexPtr[5][oz], weightPtr[4][oz], weightPtr[5][oz]);
}
}
}
// table layout:
// wd .........wd, wh............wh, ww.............ww, id...........id, ih............ih, iw..............iw
// | |
// wh0.....wh_diameter ih0.....ih_diameter
void ov::intel_cpu::InterpolateExecutor::buildTblLinear(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
const std::vector<float>& dataScales, int kernel_width, bool antialias) {
int dimSize = dataRank;
float fz = (dimSize == 5) ? dataScales[dimSize - 3] : 1.f;
float fy = dataScales[dimSize - 2];
float fx = dataScales[dimSize - 1];
size_t ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4];
size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4];
if (!(IW == OW && IH == OH && ID == OD)) {
float ax = antialias ? fx : 1.0f;
float ay = antialias ? fy : 1.0f;
float az = antialias ? fz : 1.0f;
int rx = (fx > 1.0f) ? 2 : static_cast<int>(ceil(static_cast<float>(kernel_width) / ax));
int ry = (fy > 1.0f) ? 2 : static_cast<int>(ceil(static_cast<float>(kernel_width) / ay));
int rz = (fz > 1.0f) ? 2 : static_cast<int>(ceil(static_cast<float>(kernel_width) / az));
int diaOD = 2 * rz + 1;
int diaOH = 2 * ry + 1;
int diaOW = 2 * rx + 1;
int sizeOD = OD * diaOD;
int sizeOH = OH * diaOH;
int sizeOW = OW * diaOW;
indexTable.resize((sizeOD + sizeOH + sizeOW) * 2);
float *weightTable = reinterpret_cast<float*>(&indexTable[0]);
float *weightOD = static_cast<float*>(&weightTable[0]);
float *weightOH = static_cast<float*>(&weightTable[sizeOD]);
float *weightOW = static_cast<float*>(&weightTable[sizeOD + sizeOH]);
int *idxTable = static_cast<int*>(&indexTable[sizeOD + sizeOH + sizeOW]);
int *idxOD = static_cast<int*>(&idxTable[0]);
int *idxOH = static_cast<int*>(&idxTable[sizeOD]);
int *idxOW = static_cast<int*>(&idxTable[sizeOD + sizeOH]);
for (int oz = 0; oz < OD; oz++) {
float iz = coordTransToInput(oz, fz, ID, OD);
int iz_r = static_cast<int>(std::round(iz));
for (int r = iz_r - rz, i = 0; r <= iz_r + rz; r++, i++) {
idxOD[oz * diaOD + i] = r;
if (r < 0 || r >= static_cast<int>(ID)) {
weightOD[oz * diaOD + i] = 0.f;
} else {
float dz = iz - r;
weightOD[oz * diaOD + i] = az * triangleCoeff(az * dz);
}
}
}
for (int oy = 0; oy < OH; oy++) {
float iy = coordTransToInput(oy, fy, IH, OH);
int iy_r = static_cast<int>(std::round(iy));
for (int r = iy_r - ry, i = 0; r <= iy_r + ry; r++, i++) {
idxOH[oy * diaOH + i] = r;
if (r < 0 || r >= static_cast<int>(IH)) {
weightOH[oy * diaOH + i] = 0.f;
} else {
float dy = iy - r;
weightOH[oy * diaOH + i] = ay * triangleCoeff(ay * dy);
}
}
}
for (int ox = 0; ox < OW; ox++) {
float ix = coordTransToInput(ox, fx, IW, OW);
int ix_r = static_cast<int>(std::round(ix));
for (int r = ix_r - rx, i = 0; r <= ix_r + rx; r++, i++) {
idxOW[ox * diaOW + i] = r;
if (r < 0 || r >= static_cast<int>(IW)) {
weightOW[ox * diaOW + i] = 0.f;
} else {
float dx = ix - r;
weightOW[ox * diaOW + i] = ax * triangleCoeff(ax * dx);
}
}
}
}
}
std::vector<float> ov::intel_cpu::InterpolateExecutor::getCubicCoeffs(float mantissa, float a) {
float m = std::fabs(mantissa);
std::vector<float> coeffs(4, 0.f);
coeffs[0] = a * (m - 1.0) * (m - 1.0) * m;
coeffs[1] = ((a + 2.0) * m - (a + 3.0)) * m * m + 1.0;
coeffs[2] = (((-a - 2.0) * m + (2.0 * a + 3.0)) * m - a) * m;
coeffs[3] = -a * m * m * (m - 1.0);
return coeffs;
}
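// Worked example (illustrative): with the default a = -0.75, mantissa m = 0 yields {0, 1, 0, 0}
// (the sample on the grid point is taken as is), while m = 0.5 yields
// {-0.09375, 0.59375, 0.59375, -0.09375}; in both cases the coefficients sum to 1.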
// table layout:
// OW OW OW OW OW OH OH OH OH OH
// x_idx x_weight0 x_weight1 x_weight2 x_weight3 y_idx y_weight0 y_weight1 y_weight2 y_weight3
void ov::intel_cpu::InterpolateExecutor::buildTblCubic(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector<float>& dataScales,
float cubicCoeff, InterpolateLayoutType layout) {
int dimSize = dataRank;
float fy = dataScales[dimSize - 2];
float fx = dataScales[dimSize - 1];
int IH = srcDimPad5d[3], IW = srcDimPad5d[4];
int OH = dstDim5d[3], OW = dstDim5d[4];
// idxNum for index, CUBIC_GRID_LEN for weight
const int idxNum = 1;
size_t idxWeightSize = (CUBIC_GRID_LEN + idxNum) * OW + (CUBIC_GRID_LEN + idxNum) * OH;
if (layout != InterpolateLayoutType::planar) {
indexTable.resize(idxWeightSize);
} else {
size_t sequenceSize = 2 * OH * OW;
indexTable.resize(idxWeightSize + sequenceSize);
}
int tblAdvance = 0;
int *xOrigin = static_cast<int*>(&indexTable[tblAdvance]);
tblAdvance += OW;
float *xFactor = reinterpret_cast<float*>(&indexTable[tblAdvance]);
for (int ox = 0; ox < OW; ox++) {
float ix = coordTransToInput(ox, fx, IW, OW);
int ix_r = static_cast<int>(std::floor(ix));
xOrigin[ox] = ix_r;
float m = ix - ix_r;
std::vector<float> coeffs = getCubicCoeffs(m, cubicCoeff);
xFactor[CUBIC_GRID_LEN * ox] = coeffs[0];
xFactor[CUBIC_GRID_LEN * ox + 1] = coeffs[1];
xFactor[CUBIC_GRID_LEN * ox + 2] = coeffs[2];
xFactor[CUBIC_GRID_LEN * ox + 3] = coeffs[3];
}
tblAdvance += CUBIC_GRID_LEN * OW;
int *yOrigin = static_cast<int*>(&indexTable[tblAdvance]);
tblAdvance += OH;
float *yFactor = reinterpret_cast<float*>(&indexTable[tblAdvance]);
for (int oy = 0; oy < OH; oy++) {
float iy = coordTransToInput(oy, fy, IH, OH);
int iy_r = static_cast<int>(std::floor(iy));
yOrigin[oy] = iy_r;
float m = iy - iy_r;
std::vector<float> coeffs = getCubicCoeffs(m, cubicCoeff);
yFactor[CUBIC_GRID_LEN * oy] = coeffs[0];
yFactor[CUBIC_GRID_LEN * oy + 1] = coeffs[1];
yFactor[CUBIC_GRID_LEN * oy + 2] = coeffs[2];
yFactor[CUBIC_GRID_LEN * oy + 3] = coeffs[3];
}
if (layout == InterpolateLayoutType::planar) {
tblAdvance += CUBIC_GRID_LEN * OH;
int *sequenceOH = static_cast<int*>(&indexTable[tblAdvance]);
tblAdvance += OH * OW;
int *sequenceOW = static_cast<int*>(&indexTable[tblAdvance]);
for (int h = 0; h < OH; ++h) {
int offset = h * OW;
for (int w = 0; w < OW; ++w) {
sequenceOH[offset + w] = h * sizeof(int);
sequenceOW[offset + w] = w * sizeof(int);
}
}
}
}
// shapeND: n c d h w
// blockND: ncdhw cdhw dhw hw w 1
// index : 0 1 2 3 4 5
inline SizeVector getBlockND(const SizeVector& shape) {
int shapeRank = shape.size();
SizeVector blockND(shapeRank + 1, 1);
for (int i = shapeRank - 1; i >= 0; i--) {
blockND[i] = shape[i] * blockND[i+1];
}
return blockND;
}
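// Illustrative example: for shape {2, 3, 4, 5, 6} getBlockND returns {720, 360, 120, 30, 6, 1},
// i.e. blockND[i] is the number of elements covered by one step along dimension i - 1,
// and blockND[0] is the total element count.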
const uint8_t* ov::intel_cpu::InterpolateExecutor::padPreprocess(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst) {
const uint8_t *src_data_origin = reinterpret_cast<uint8_t*>(src[0]->GetData());
const auto &srcDim = src[0]->getStaticDims();
const auto &dstDim = dst[0]->getStaticDims();
size_t dimSize = srcDim.size();
auto srcDimPad = getSrcDimPad5d();
const auto srcDim5d = to5Dim(srcDim);
const auto srcDimPad5d = to5Dim(srcDimPad);
const auto dstDim5d = to5Dim(dstDim);
const auto srcDataSize = src[0]->getDesc().getPrecision().size();
const uint8_t *src_data = nullptr;
std::vector<uint8_t> srcPadded;
if (interpAttrs.hasPad) {
int padB0 = (dimSize > 2) ? interpAttrs.padBegin[0] : 0;
int padB1 = (dimSize > 2) ? interpAttrs.padBegin[1] : 0;
int padB2 = (dimSize == 5) ? interpAttrs.padBegin[dimSize - 3] : 0;
int padB3 = interpAttrs.padBegin[dimSize - 2];
int padB4 = interpAttrs.padBegin[dimSize - 1];
SizeVector inShapeBlock = getBlockND(srcDim5d);
SizeVector inShapePadBlock = getBlockND(srcDimPad5d);
if (interpAttrs.layout == InterpolateLayoutType::planar) {
srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) {
const uint8_t *src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + inShapeBlock[3] * d + inShapeBlock[4] * h) * srcDataSize;
uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) +
inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * srcDataSize;
cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize);
});
src_data = src_data_pad;
} else if (interpAttrs.layout == InterpolateLayoutType::by_channel) {
srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) {
const uint8_t *src = src_data_origin + (inShapeBlock[1] * n +
(inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * srcDataSize;
uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + (inShapePadBlock[3] * (d + padB2) +
inShapePadBlock[4] * (h + padB3) +
inShapePadBlock[5] * (w + padB4)) * srcDimPad5d[1] + padB1) * srcDataSize;
cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize);
});
src_data = src_data_pad;
} else if (interpAttrs.layout == InterpolateLayoutType::block) {
size_t blkSize = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) ? 16 : 8;
size_t CB = div_up(srcDimPad5d[1], blkSize);
size_t eltsTotal = srcDimPad5d[0] * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize;
srcPadded.resize(eltsTotal * srcDataSize, 0x0);
uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
if ((srcDim5d[0] != srcDimPad5d[0]) || (srcDim5d[1] != srcDimPad5d[1])) {
IE_THROW() << "Interpolate layer with name does not support padding on batch and channel dimensions";
}
parallel_for5d(srcDim5d[0], CB, srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int cb, int d, int h, int w) {
const uint8_t *src = src_data_origin + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
+ (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
+ (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
+ (h * srcDim5d[4] * blkSize) * srcDataSize
+ (w * blkSize) * srcDataSize;
uint8_t *srcPad = src_data_pad + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
+ (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
+ ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
+ ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize
+ ((w + padB4) * blkSize) * srcDataSize;
cpu_memcpy(srcPad, src, blkSize * srcDataSize);
});
src_data = src_data_pad;
}
} else {
src_data = src_data_origin;
}
return src_data;
}

View File

@ -0,0 +1,187 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ie_common.h>
#include <node.h>
#include <string>
#include <memory>
#include <vector>
#define MAX_INPUT_INTERPOLATE 8
using namespace InferenceEngine;
namespace ov {
namespace intel_cpu {
enum InterpolateLayoutType {
planar,
block,
by_channel
};
enum InterpolateMode {
nearest,
linear,
linear_onnx,
cubic
};
enum InterpolateCoordTransMode {
half_pixel,
pytorch_half_pixel,
asymmetric,
tf_half_pixel_for_nn,
align_corners
};
enum class InterpolateNearestMode {
round_prefer_floor,
round_prefer_ceil,
floor,
ceil,
simple
};
enum class InterpolateShapeCalcMode {
sizes,
scales
};
struct InterpolateAttrs {
InterpolateMode mode = InterpolateMode::nearest;
InterpolateCoordTransMode coordTransMode = InterpolateCoordTransMode::half_pixel;
InterpolateNearestMode nearestMode = InterpolateNearestMode::round_prefer_floor;
bool antialias = false;
float cubeCoeff = -0.75;
std::vector<int> padBegin;
std::vector<int> padEnd;
InferenceEngine::Precision inPrc;
InferenceEngine::Precision outPrc;
InterpolateLayoutType layout;
std::vector<float> dataScales;
bool hasPad = false;
};
inline SizeVector getPaddedInputShape(const VectorDims &srcDims,
const std::vector<int> &padBegin,
const std::vector<int> &padEnd) {
SizeVector paddedShape;
int dataRank = srcDims.size();
for (int i = 0; i < dataRank; i++) {
paddedShape.push_back(srcDims[i] + padBegin[i] + padEnd[i]);
}
return paddedShape;
}
inline int clipCoord(int pos, int length) {
return std::max(static_cast<int>(0), std::min(pos, length - 1));
}
inline size_t getSpatialDimsNum(const Dim rank) {
switch (rank) {
case 1:
case 3:
return 1;
case 2:
case 4:
return 2;
case 5:
return 3;
default:
IE_THROW() << "Can't define number spatial";
}
}
// w/hw/ncw/nchw/ncdhw to ncdhw
inline SizeVector to5Dim(SizeVector casesDim) {
size_t caseSize = casesDim.size();
SizeVector dim5(5, 1lu);
dim5[4] = casesDim[caseSize - 1];
if (caseSize > 1) {
dim5[3] = casesDim[caseSize - 2];
}
if (caseSize > 2) {
dim5[0] = casesDim[0];
}
if (caseSize > 3) {
dim5[1] = casesDim[1];
}
if (caseSize > 4) {
dim5[2] = casesDim[2];
}
if (caseSize == 3) { // nhw -> ncw
dim5[1] = dim5[3];
dim5[3] = 1lu;
}
return dim5;
}
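// Illustrative mapping of the supported ranks:
//   {w}             -> {1, 1, 1, 1, w}
//   {h, w}          -> {1, 1, 1, h, w}
//   {n, c, w}       -> {n, c, 1, 1, w}
//   {n, c, h, w}    -> {n, c, 1, h, w}
//   {n, c, d, h, w} -> unchanged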
static inline float triangleCoeff(float x) {
return (std::max)(0.0f, 1 - std::abs(x));
}
class InterpolateExecutor {
public:
static constexpr size_t DATA_ID = 0;
static constexpr size_t TARGET_SHAPE_ID = 1;
static constexpr size_t SCALES_ID = 2;
static constexpr size_t AXES_ID = 3;
static constexpr int CUBIC_GRID_LEN = 4;
InterpolateExecutor(const ExecutorContext::CPtr context) : _context(context) {}
virtual bool init(const InterpolateAttrs& interpolateAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr);
virtual void exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) = 0;
virtual impl_desc_type getImplType() const = 0;
virtual ~InterpolateExecutor() = default;
VectorDims getSrcDimPad5d() const { return srcDimPad5d; }
const uint8_t* padPreprocess(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst);
private:
void buildTblNN(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector<float>& dataScales,
InterpolateLayoutType layout, InterpolateNearestMode nearestMode);
void buildTblLinearOnnx(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector<float>& dataScales,
InterpolateLayoutType layout);
void buildTblLinear(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector<float>& dataScales, int kernel_width,
bool antialias);
void buildTblCubic(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector<float>& dataScales, float cubicCoeff,
InterpolateLayoutType layout);
float coordTransToInput(int outCoord, float scale, int inShape, int outShape) const;
int nearestRound(float origin, bool isDownsample, InterpolateNearestMode nearestMode) const;
void linearOnnxCF(int outCoord, float scale, int inShape, int outShape, int& index0, int& index1, float& weight0, float& weight1);
std::vector<float> getCubicCoeffs(float mantissa, float a);
protected:
InterpolateAttrs interpAttrs;
VectorDims srcDimPad5d, dstDim5d;
size_t srcDataSize, dstDataSize;
int spatialDimSize;
size_t dataRank;
std::vector<int> indexTable;
const ExecutorContext::CPtr _context;
};
using InterpolateExecutorPtr = std::shared_ptr<InterpolateExecutor>;
using InterpolateExecutorCPtr = std::shared_ptr<const InterpolateExecutor>;
class InterpolateExecutorBuilder {
public:
~InterpolateExecutorBuilder() = default;
virtual bool isSupported(const InterpolateAttrs& InterpolateAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs) const = 0;
virtual InterpolateExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0;
};
using InterpolateExecutorBuilderPtr = std::shared_ptr<InterpolateExecutorBuilder>;
using InterpolateExecutorBuilderCPtr = std::shared_ptr<const InterpolateExecutorBuilder>;
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,19 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "interpolate_list.hpp"
namespace ov {
namespace intel_cpu {
const std::vector<InterpolateExecutorDesc>& getInterpolateExecutorsList() {
static std::vector<InterpolateExecutorDesc> descs = {
OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared<ACLInterpolateExecutorBuilder>())
};
return descs;
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,85 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "executor.hpp"
#include "interpolate.hpp"
#if defined(OV_CPU_WITH_ACL)
#include "acl/acl_interpolate.hpp"
#endif
#include "onednn/iml_type_mapper.h"
#include "common/primitive_cache.hpp"
namespace ov {
namespace intel_cpu {
struct InterpolateExecutorDesc {
ExecutorType executorType;
InterpolateExecutorBuilderCPtr builder;
};
const std::vector<InterpolateExecutorDesc>& getInterpolateExecutorsList();
class InterpolateExecutorFactory : public ExecutorFactory {
public:
InterpolateExecutorFactory(const InterpolateAttrs& InterpolateAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const ExecutorContext::CPtr context) : ExecutorFactory(context) {
for (auto& desc : getInterpolateExecutorsList()) {
if (desc.builder->isSupported(InterpolateAttrs, srcDescs, dstDescs)) {
supportedDescs.push_back(desc);
}
}
}
~InterpolateExecutorFactory() = default;
virtual InterpolateExecutorPtr makeExecutor(const InterpolateAttrs& interpolateAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) {
auto build = [&](const InterpolateExecutorDesc* desc) {
auto executor = desc->builder->makeExecutor(context);
if (executor->init(interpolateAttrs, srcDescs, dstDescs, attr)) {
return executor;
}
InterpolateExecutorPtr ptr = nullptr;
return ptr;
};
if (chosenDesc) {
if (auto executor = build(chosenDesc)) {
return executor;
}
}
for (const auto& sd : supportedDescs) {
if (auto executor = build(&sd)) {
chosenDesc = &sd;
return executor;
}
}
IE_THROW() << "Supported Interpolate executor is not found";
}
bool isEmpty() {
return supportedDescs.empty();
}
private:
std::vector<InterpolateExecutorDesc> supportedDescs;
const InterpolateExecutorDesc* chosenDesc = nullptr;
};
using InterpolateExecutorFactoryPtr = std::shared_ptr<InterpolateExecutorFactory>;
using InterpolateExecutorFactoryCPtr = std::shared_ptr<const InterpolateExecutorFactory>;
} // namespace intel_cpu
} // namespace ov
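// Minimal usage sketch (illustrative only; attrs, srcDescs, dstDescs, graphContext, implPriorities
// and the memory vectors are assumed to be prepared by the calling node and are not part of this header):
//
//   auto ctx = std::make_shared<ExecutorContext>(graphContext, implPriorities);
//   auto factory = std::make_shared<InterpolateExecutorFactory>(attrs, srcDescs, dstDescs, ctx);
//   if (!factory->isEmpty()) {
//       auto executor = factory->makeExecutor(attrs, srcDescs, dstDescs, dnnl::primitive_attr());
//       executor->exec(srcMemory, dstMemory, nullptr);
//   }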

View File

@ -0,0 +1,38 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "mvn.hpp"
namespace ov {
namespace intel_cpu {
using namespace InferenceEngine;
MVNExecutor::MVNExecutor(const ExecutorContext::CPtr context) : context(context) {}
SizeVector MVNExecutor::transformTo5DCase(const SizeVector& shape, bool initAcrossChannels) {
switch (shape.size()) {
// For ranks 1 and 2, if initAcrossChannels_ is true, adjust the shape so it can be fully vectorized under the unified 5D procedure;
// otherwise there is not enough data in the spatial dimension to process in one kernel.
case 1 : // C
if (initAcrossChannels) {
return SizeVector({1, 1, 1, 1, shape[0]});
} else {
return SizeVector({1, shape[0], 1, 1, 1});
}
case 2 : // NC
if (initAcrossChannels) {
return SizeVector({1, shape[0], 1, shape[1], 1});
} else {
return SizeVector({shape[0], shape[1], 1, 1, 1});
}
case 3 : { return SizeVector({shape[0], shape[1], 1, shape[2], 1}); }
case 4 : { return SizeVector({shape[0], shape[1], 1, shape[2], shape[3]}); }
case 5 : { return SizeVector({shape[0], shape[1], shape[2], shape[3], shape[4]}); }
default : { IE_THROW() << "MVN executor doesn't support planar layout with rank: " << shape.size(); }
}
}
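// Illustrative example: a rank-2 shape {N, C} = {8, 16} becomes {1, 8, 1, 16, 1} when
// initAcrossChannels is true (the 16 channel values of each batch item land in the spatial
// dimensions) and {8, 16, 1, 1, 1} otherwise (each (n, c) pair is normalized on its own).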
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,72 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "cpu_memory.h"
#include "onednn/iml_type_mapper.h"
#include "executor.hpp"
namespace ov {
namespace intel_cpu {
enum MVNLayoutType {
mvn_planar,
mvn_block,
mvn_by_channel
};
// Defines the way epsilon is added: inside the sqrt or outside of it.
enum MVNEpsMode {
INSIDE_SQRT,
OUTSIDE_SQRT
};
struct MVNAttrs {
MVNLayoutType layout;
std::tuple<size_t, size_t, size_t, size_t, size_t> shape5D;
bool initAcrossChannels_;
bool execAcrossChannels_;
bool normalizeVariance_;
float epsValue_;
MVNEpsMode epsMode_;
InferenceEngine::Precision src_prc;
InferenceEngine::Precision dst_prc;
};
class MVNExecutor {
public:
MVNExecutor(const ExecutorContext::CPtr context);
virtual bool init(const MVNAttrs& mvnAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) = 0;
virtual void exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) = 0;
virtual ~MVNExecutor() = default;
virtual impl_desc_type getImplType() const = 0;
static InferenceEngine::SizeVector transformTo5DCase(const InferenceEngine::SizeVector& shape, bool initAcrossChannels);
protected:
MVNAttrs mvnAttrs;
const ExecutorContext::CPtr context;
};
using MVNExecutorPtr = std::shared_ptr<MVNExecutor>;
using MVNExecutorCPtr = std::shared_ptr<const MVNExecutor>;
class MVNExecutorBuilder {
public:
~MVNExecutorBuilder() = default;
virtual bool isSupported(const MVNAttrs& mvnAttrs, const std::vector<MemoryDescPtr>& srcDescs, const std::vector<MemoryDescPtr>& dstDescs) const = 0;
virtual MVNExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0;
};
using MVNExecutorBuilderPtr = std::shared_ptr<MVNExecutorBuilder>;
using MVNExecutorBuilderCPtr = std::shared_ptr<const MVNExecutorBuilder>;
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,19 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "mvn_list.hpp"
namespace ov {
namespace intel_cpu {
const std::vector<MVNExecutorDesc>& getMVNExecutorsList() {
static std::vector<MVNExecutorDesc> descs = {
OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared<AclMVNExecutorBuilder>())
};
return descs;
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,84 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "executor.hpp"
#include "mvn.hpp"
#if defined(OV_CPU_WITH_ACL)
#include "acl/acl_mvn.hpp"
#endif
#include "onednn/iml_type_mapper.h"
#include "common/primitive_cache.hpp"
namespace ov {
namespace intel_cpu {
struct MVNExecutorDesc {
ExecutorType executorType;
MVNExecutorBuilderCPtr builder;
};
const std::vector<MVNExecutorDesc>& getMVNExecutorsList();
class MVNExecutorFactory : public ExecutorFactory {
public:
MVNExecutorFactory(const MVNAttrs& mvnAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const ExecutorContext::CPtr context) : ExecutorFactory(context) {
for (auto& desc : getMVNExecutorsList()) {
if (desc.builder->isSupported(mvnAttrs, srcDescs, dstDescs)) {
supportedDescs.push_back(desc);
}
}
}
~MVNExecutorFactory() = default;
virtual MVNExecutorPtr makeExecutor(const MVNAttrs& mvnAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) {
auto build = [&](const MVNExecutorDesc* desc) {
auto executor = desc->builder->makeExecutor(context);
if (executor->init(mvnAttrs, srcDescs, dstDescs, attr)) {
return executor;
}
MVNExecutorPtr ptr = nullptr;
return ptr;
};
if (chosenDesc) {
if (auto executor = build(chosenDesc)) {
return executor;
}
}
for (const auto& sd : supportedDescs) {
if (auto executor = build(&sd)) {
chosenDesc = &sd;
return executor;
}
}
IE_THROW() << "Supported MVN executor is not found";
}
bool isEmpty() {
return supportedDescs.empty();
}
private:
std::vector<MVNExecutorDesc> supportedDescs;
const MVNExecutorDesc* chosenDesc = nullptr;
};
using MVNExecutorFactoryPtr = std::shared_ptr<MVNExecutorFactory>;
using MVNExecutorFactoryCPtr = std::shared_ptr<const MVNExecutorFactory>;
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,15 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "pooling.hpp"
namespace ov {
namespace intel_cpu {
using namespace InferenceEngine;
PoolingExecutor::PoolingExecutor(const ExecutorContext::CPtr context) : context(context) {}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,75 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "cpu_memory.h"
#include "onednn/iml_type_mapper.h"
#include "executor.hpp"
namespace ov {
namespace intel_cpu {
struct PoolingAttrs {
bool exclude_pad = false;
bool auto_pad = false;
op::PadType pad_type;
Algorithm algorithm;
op::RoundingType rounding;
std::vector<ptrdiff_t> stride;
std::vector<ptrdiff_t> kernel;
std::vector<ptrdiff_t> dilation;
std::vector<ptrdiff_t> data_pad_begin;
std::vector<ptrdiff_t> data_pad_end;
/// Effective padding. Used by oneDNN to define the correct output shape via the
/// formula (iw - kernel + pad_l + pad_r) / strides[i - 2] + 1;
/// it should be passed into the pooling descriptor constructor.
std::vector<ptrdiff_t> effective_pad_begin;
std::vector<ptrdiff_t> effective_pad_end;
/// Effective dilation. Used to define the correct dilation for oneDNN.
/// For oneDNN the default dilation is a vector of zeros.
std::vector<ptrdiff_t> effective_dilation;
};
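// Worked example for the effective padding formula above (illustrative): with iw = 7,
// kernel = 3, pad_l = pad_r = 1 and stride = 2 the output width is
// (7 - 3 + 1 + 1) / 2 + 1 = 4.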
class PoolingExecutor {
public:
PoolingExecutor(const ExecutorContext::CPtr context);
virtual bool init(const PoolingAttrs& poolingAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) = 0;
virtual void exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, std::unordered_map<int, MemoryPtr> postOpsArgs) = 0;
virtual ~PoolingExecutor() = default;
virtual impl_desc_type getImplType() const = 0;
protected:
PoolingAttrs poolingAttrs;
const ExecutorContext::CPtr context;
};
using PoolingExecutorPtr = std::shared_ptr<PoolingExecutor>;
using PoolingExecutorCPtr = std::shared_ptr<const PoolingExecutor>;
class PoolingExecutorBuilder {
public:
~PoolingExecutorBuilder() = default;
virtual bool isSupported(const PoolingAttrs& poolingAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs) const = 0;
virtual PoolingExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0;
};
using PoolingExecutorBuilderPtr = std::shared_ptr<PoolingExecutorBuilder>;
using PoolingExecutorBuilderCPtr = std::shared_ptr<const PoolingExecutorBuilder>;
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,19 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "pooling_list.hpp"
namespace ov {
namespace intel_cpu {
const std::vector<PoolingExecutorDesc>& getPoolingExecutorsList() {
static std::vector<PoolingExecutorDesc> descs = {
OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared<AclPoolingExecutorBuilder>())
};
return descs;
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,78 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "executor.hpp"
#include "pooling.hpp"
#if defined(OV_CPU_WITH_ACL)
#include "acl/acl_pooling.hpp"
#endif
namespace ov {
namespace intel_cpu {
struct PoolingExecutorDesc {
ExecutorType executorType;
PoolingExecutorBuilderCPtr builder;
};
const std::vector<PoolingExecutorDesc>& getPoolingExecutorsList();
class PoolingExecutorFactory : public ExecutorFactory {
public:
PoolingExecutorFactory(const PoolingAttrs& poolingAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const ExecutorContext::CPtr context) : ExecutorFactory(context) {
for (auto& desc : getPoolingExecutorsList()) {
if (desc.builder->isSupported(poolingAttrs, srcDescs, dstDescs)) {
supportedDescs.push_back(desc);
}
}
}
~PoolingExecutorFactory() = default;
virtual PoolingExecutorPtr makeExecutor(const PoolingAttrs& poolingAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) {
auto build = [&](const PoolingExecutorDesc* desc) {
auto executor = desc->builder->makeExecutor(context);
if (executor->init(poolingAttrs, srcDescs, dstDescs, attr)) {
return executor;
}
PoolingExecutorPtr ptr = nullptr;
return ptr;
};
if (chosenDesc) {
if (auto executor = build(chosenDesc)) {
return executor;
}
}
for (const auto& sd : supportedDescs) {
if (auto executor = build(&sd)) {
chosenDesc = &sd;
return executor;
}
}
IE_THROW() << "Supported Pooling executor is not found";
}
private:
std::vector<PoolingExecutorDesc> supportedDescs;
const PoolingExecutorDesc* chosenDesc = nullptr;
};
using PoolingExecutorFactoryPtr = std::shared_ptr<PoolingExecutorFactory>;
using PoolingExecutorFactoryCPtr = std::shared_ptr<const PoolingExecutorFactory>;
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,15 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "reduce.hpp"
namespace ov {
namespace intel_cpu {
using namespace InferenceEngine;
ReduceExecutor::ReduceExecutor(const ExecutorContext::CPtr context) : context(context) {}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,55 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "cpu_memory.h"
#include "onednn/iml_type_mapper.h"
#include "dnnl_scratch_pad.h"
#include "executor.hpp"
namespace ov {
namespace intel_cpu {
struct ReduceAttrs {
std::vector<int> axes;
Algorithm operation;
bool keepDims;
};
class ReduceExecutor {
public:
ReduceExecutor(const ExecutorContext::CPtr context);
virtual bool init(const ReduceAttrs& reduceAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) = 0;
virtual void exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) = 0;
virtual ~ReduceExecutor() = default;
virtual impl_desc_type getImplType() const = 0;
protected:
ReduceAttrs reduceAttrs;
const ExecutorContext::CPtr context;
};
using ReduceExecutorPtr = std::shared_ptr<ReduceExecutor>;
using ReduceExecutorCPtr = std::shared_ptr<const ReduceExecutor>;
class ReduceExecutorBuilder {
public:
~ReduceExecutorBuilder() = default;
virtual bool isSupported(const ReduceAttrs& reduceAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs) const = 0;
virtual ReduceExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0;
};
using ReduceExecutorBuilderPtr = std::shared_ptr<ReduceExecutorBuilder>;
using ReduceExecutorBuilderCPtr = std::shared_ptr<const ReduceExecutorBuilder>;
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,19 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "reduce_list.hpp"
namespace ov {
namespace intel_cpu {
const std::vector<ReduceExecutorDesc>& getReduceExecutorsList() {
static std::vector<ReduceExecutorDesc> descs = {
OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared<AclReduceExecutorBuilder>())
};
return descs;
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,85 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "executor.hpp"
#include "reduce.hpp"
#if defined(OV_CPU_WITH_ACL)
#include "acl/acl_reduce.hpp"
#endif
#include "onednn/iml_type_mapper.h"
#include "common/primitive_cache.hpp"
namespace ov {
namespace intel_cpu {
struct ReduceExecutorDesc {
ExecutorType executorType;
ReduceExecutorBuilderCPtr builder;
};
const std::vector<ReduceExecutorDesc>& getReduceExecutorsList();
class ReduceExecutorFactory : public ExecutorFactory {
public:
ReduceExecutorFactory(const ReduceAttrs& reduceAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const ExecutorContext::CPtr context) : ExecutorFactory(context) {
for (auto& desc : getReduceExecutorsList()) {
if (desc.builder->isSupported(reduceAttrs, srcDescs, dstDescs)) {
supportedDescs.push_back(desc);
}
}
}
~ReduceExecutorFactory() = default;
virtual ReduceExecutorPtr makeExecutor(const ReduceAttrs& reduceAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) {
auto build = [&](const ReduceExecutorDesc* desc) {
auto executor = desc->builder->makeExecutor(context);
if (executor->init(reduceAttrs, srcDescs, dstDescs, attr)) {
return executor;
}
ReduceExecutorPtr ptr = nullptr;
return ptr;
};
if (chosenDesc) {
if (auto executor = build(chosenDesc)) {
return executor;
}
}
for (const auto& sd : supportedDescs) {
if (auto executor = build(&sd)) {
chosenDesc = &sd;
return executor;
}
}
IE_THROW() << "Supported Reduce executor is not found";
}
bool isEmpty() {
return supportedDescs.empty();
}
private:
std::vector<ReduceExecutorDesc> supportedDescs;
const ReduceExecutorDesc* chosenDesc = nullptr;
};
using ReduceExecutorFactoryPtr = std::shared_ptr<ReduceExecutorFactory>;
using ReduceExecutorFactoryCPtr = std::shared_ptr<const ReduceExecutorFactory>;
} // namespace intel_cpu
} // namespace ov

View File

@ -25,7 +25,7 @@ using namespace Xbyak;
namespace ov {
namespace intel_cpu {
namespace node {
#if defined(OPENVINO_ARCH_X86_64)
#define GET_OFF(field) offsetof(jit_extract_image_patches_args, field)
template <cpu_isa_t isa>
@ -270,6 +270,7 @@ private:
dd(i * jpp.SW * jpp.dtype_size);
}
};
#endif // OPENVINO_ARCH_X86_64
bool ExtractImagePatches::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
@ -481,6 +482,7 @@ void ExtractImagePatches::ExtractImagePatchesRefExecutor::executeReference(
void ExtractImagePatches::ExtractImagePatchesJitExecutor::executeOptimizedGeneric(
void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) const {
#if defined(OPENVINO_ARCH_X86_64)
const char* src_data = reinterpret_cast<const char*>(src);
char* dst_data = reinterpret_cast<char*>(dst);
const auto& jpp = pKernel->jpp;
@ -507,6 +509,7 @@ void ExtractImagePatches::ExtractImagePatchesJitExecutor::executeOptimizedGeneri
args.w_hi_pad = iw_hpad;
(*pKernel)(&args);
});
#endif // OPENVINO_ARCH_X86_64
}
jit_extract_image_patches_params ExtractImagePatches::ExtractImagePatchesExecutor::fillJpp(
@ -585,6 +588,7 @@ ExtractImagePatches::ExtractImagePatchesJitExecutor::ExtractImagePatchesJitExecu
const VectorDims& rates,
const ExtImgPatcherPadType& padType,
const size_t prcSize) {
#if defined(OPENVINO_ARCH_X86_64)
auto jpp = fillJpp(inDims, outDims, kSizes, strides, rates, padType, prcSize);
if (mayiuse(x64::avx512_core)) {
pKernel.reset(new jit_extract_image_patches_kernel<x64::avx512_core>(jpp));
@ -598,6 +602,7 @@ ExtractImagePatches::ExtractImagePatchesJitExecutor::ExtractImagePatchesJitExecu
if (pKernel)
pKernel->create_ker();
#endif // OPENVINO_ARCH_X86_64
}
void ExtractImagePatches::ExtractImagePatchesJitExecutor::exec(

View File

@ -45,7 +45,7 @@ using namespace Xbyak;
namespace ov {
namespace intel_cpu {
namespace node {
#if defined(OPENVINO_ARCH_X86_64)
#define GET_OFF(field) offsetof(jit_quantize_call_args, field)
template <cpu_isa_t isa>
@ -228,7 +228,7 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_
};
void generate() override {
do_dequantization = jqp_.op_type == Algorithm::FQCommon || jqp_.op_type == Algorithm::FQRequantization;
do_dequantization = jqp_.op_type == Algorithm::FQCommon;
do_rounding = do_dequantization || jqp_.dst_prc == Precision::FP32;
this->preamble();
@ -863,7 +863,7 @@ private:
}
}
};
#endif
bool FakeQuantize::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
const auto fq = std::dynamic_pointer_cast<const ngraph::opset1::FakeQuantize>(op);
@ -1236,8 +1236,7 @@ FakeQuantize::FakeQuantize(const std::shared_ptr<ngraph::Node>& op, const GraphC
}
}
algorithm = quantizationOnly ? Algorithm::FQQuantization :
(isFakeQuantization || isFakeQuantizationWithScale) ? Algorithm::FQCommon : Algorithm::FQRequantization;
algorithm = quantizationOnly ? Algorithm::FQQuantization : Algorithm::FQCommon;
}
} else {
IE_THROW(NotImplemented) << errorMessage;
@ -1326,7 +1325,6 @@ void FakeQuantize::initSupportedPrimitiveDescriptors() {
} else {
impl_type = impl_desc_type::ref;
}
if (!mayiuse(cpu::x64::sse41) || getAxis() != 1) {
impl_type = impl_desc_type::ref;
@ -1597,8 +1595,8 @@ void FakeQuantize::executeReference() {
});
}
}
void FakeQuantize::executeBinarization(const std::unique_ptr<jit_uni_quantize_kernel> &pKernel) const {
#if defined(OPENVINO_ARCH_X86_64)
const auto &srcMemory = getParentEdgeAt(0)->getMemoryPtr();
auto &dstMemory = getChildEdgeAt(0)->getMemoryPtr();
@ -1636,9 +1634,11 @@ void FakeQuantize::executeBinarization(const std::unique_ptr<jit_uni_quantize_ke
(*pKernel)(&arg);
});
#endif
}
void FakeQuantize::executeQuantization(const std::unique_ptr<jit_uni_quantize_kernel> &pKernel) const {
#if defined(OPENVINO_ARCH_X86_64)
auto &srcMemory = getParentEdgeAt(0)->getMemoryPtr();
auto &dstMemory = getChildEdgeAt(0)->getMemoryPtr();
@ -1761,6 +1761,7 @@ void FakeQuantize::executeQuantization(const std::unique_ptr<jit_uni_quantize_ke
(*pKernel)(&arg);
});
}
#endif
}
void FakeQuantize::executeDynamicImpl(dnnl::stream strm) {
@ -2111,6 +2112,7 @@ bool FakeQuantize::appendAttrPostOps(DnnlPostOpsComposer& dnnlpoc,
}
FakeQuantize::FakeQuantizeJitExecutor::FakeQuantizeJitExecutor(const jit_quantize_params &_jqp) {
#if defined(OPENVINO_ARCH_X86_64)
bool isBinarization = _jqp.op_type == Algorithm::FQBinarization;
if (mayiuse(cpu::x64::avx512_core)) {
if (isBinarization)
@ -2133,6 +2135,7 @@ FakeQuantize::FakeQuantizeJitExecutor::FakeQuantizeJitExecutor(const jit_quantiz
if (pKernel) {
pKernel->create_ker();
}
#endif
}
void FakeQuantize::FakeQuantizeJitExecutor::exec(const FakeQuantize& node) {

View File

@ -166,13 +166,11 @@ private:
};
using executorPtr = std::shared_ptr<FakeQuantizeExecutor>;
executorPtr execPtr = nullptr;
struct FakeQuantizeJitExecutor : public FakeQuantizeExecutor {
FakeQuantizeJitExecutor(const jit_quantize_params &_jqp);
void exec(const FakeQuantize& node) override;
std::unique_ptr<jit_uni_quantize_kernel> pKernel;
};
void init() override;
std::vector<LayoutType> getDataFormats() const;
void initializePostOpData(const VectorDims &postOpDims, const size_t bufferAlignment, bool doRounding);

View File

@ -9,7 +9,7 @@
#include "fake_quantize.h"
#include "input.h"
#include "reorder.h"
#include "ngraph_transformations/op/fully_connected.hpp"
#include "transformations/cpu_opset/common/op/fully_connected.hpp"
#include "ngraph/opsets/opset1.hpp"
#include "dnnl_extension_utils.h"
#include "onednn/dnnl.h"

View File

@ -10,7 +10,7 @@
#include <ngraph/opsets/opset1.hpp>
#include "common/cpu_memcpy.h"
#include <utils/general_utils.h>
#include "kernels/gather_uni_kernel.hpp"
#include "kernels/x64/gather_uni_kernel.hpp"
#include "utils/shape_inference/shape_inference_cpu.hpp"
using namespace InferenceEngine;
@ -205,6 +205,7 @@ void Gather::initSupportedPrimitiveDescriptors() {
}
void Gather::createPrimitive() {
#if defined(OPENVINO_ARCH_X86_64)
uint64_t idxElPerVec = 1;
if (!isDynamicNode()) {
idxElPerVec = x64::mayiuse(x64::avx512_core) ? x64::cpu_isa_traits<x64::avx512_core>::vlen / idxTypeSize :
@ -269,7 +270,7 @@ void Gather::createPrimitive() {
}
}
}
#endif
Node::createPrimitive();
}
@ -323,6 +324,7 @@ void Gather::prepareParams() {
totalWork = beforeBatchSize * betweenBatchAndAxisSize * specIndicesSize * afterAxisSize;
}
#if defined(OPENVINO_ARCH_X86_64)
const auto& selectedPD = getSelectedPrimitiveDescriptor();
if (jitKernel && jitKernel->isSupportedConfiguration(afterAxisSize)) {
if (x64::mayiuse(x64::avx512_core)) {
@ -330,12 +332,12 @@ void Gather::prepareParams() {
} else if (x64::mayiuse(x64::avx2)) {
selectedPD->setImplementationType(jit_avx2);
}
} else {
selectedPD->setImplementationType(ref_any);
}
#endif
}
void Gather::execute(dnnl::stream strm) {
#if defined(OPENVINO_ARCH_X86_64)
if (jitKernel && jitKernel->isSupportedConfiguration(afterAxisSize)) {
const void* srcIndices = getParentEdgeAt(GATHER_INDICES)->getMemoryPtr()->GetPtr();
const void* srcData = getParentEdgeAt(GATHER_DATA)->getMemoryPtr()->GetPtr();
@ -383,12 +385,15 @@ void Gather::execute(dnnl::stream strm) {
};
parallel_nt(0, threadBody);
} else {
execReference();
return;
}
#endif
execReference();
}
void Gather::executeDynamicImpl(dnnl::stream strm) {
#if defined(OPENVINO_ARCH_X86_64)
if (jitKernel && jitKernel->isSupportedConfiguration(afterAxisSize)) {
const void* srcIndices = getParentEdgeAt(GATHER_INDICES)->getMemoryPtr()->GetPtr();
const void* srcData = getParentEdgeAt(GATHER_DATA)->getMemoryPtr()->GetPtr();
@ -442,9 +447,11 @@ void Gather::executeDynamicImpl(dnnl::stream strm) {
};
parallel_nt(0, threadBody);
} else {
execReference();
return;
}
#endif
execReference();
}
void Gather::initShortParams(threadExecParams& p, const uint64_t start) {

View File

@ -5,7 +5,7 @@
#pragma once
#include <node.h>
#include "kernels/gather_uni_kernel.hpp"
#include "kernels/x64/gather_uni_kernel.hpp"
#include <memory>
#include <string>

View File

@ -5,7 +5,7 @@
#pragma once
#include <node.h>
#include "kernels/grid_sample.hpp"
#include "kernels/x64/grid_sample.hpp"
#include <memory>
#include <string>

View File

@ -33,8 +33,9 @@ using namespace Xbyak;
namespace ov {
namespace intel_cpu {
namespace node {
namespace {
#if defined(OPENVINO_ARCH_X86_64)
namespace {
struct jit_has_subnormals_base : public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_has_subnormals_base)
@ -229,6 +230,7 @@ jit_has_subnormals_base::fn_t jit_has_subnormals_function() {
}
} // namespace
#endif
Input::Input(const std::shared_ptr<ngraph::Node>& op, const GraphContext::CPtr context)
: Node(op, context, PassThroughShapeInferFactory()) {
@ -297,6 +299,7 @@ void Input::cloneBlobIfRequired() {
if (!size)
return false;
#if defined(OPENVINO_ARCH_X86_64)
if (auto fn = jit_has_subnormals_function()) {
static const size_t batch_size = 2048;
const size_t iterations_num = size / batch_size + 1;
@ -318,11 +321,12 @@ void Input::cloneBlobIfRequired() {
});
return has_subnormals;
} else {
for (size_t i = 0; i < size; ++i) {
if (u32data[i] && (u32data[i] & (0xFF << 23)) == 0) {
return true;
}
}
#endif
for (size_t i = 0; i < size; ++i) {
if (u32data[i] && (u32data[i] & (0xFF << 23)) == 0) {
return true;
}
}
}

View File

@ -6,7 +6,7 @@
#include <string>
#include <vector>
#include "ngraph_transformations/op/interaction.hpp"
#include "transformations/cpu_opset/x64/op/interaction.hpp"
#include "interaction.h"
#include <onednn/dnnl.h>
#include <dnnl_extension_utils.h>
@ -18,8 +18,8 @@
#include <ie_ngraph_utils.hpp>
#include <cpu/x64/cpu_isa_traits.hpp>
#include <cpu/x64/jit_generator.hpp>
#include "emitters/jit_dnnl_emitters.hpp"
#include "emitters/jit_load_store_emitters.hpp"
#include "emitters/x64/jit_dnnl_emitters.hpp"
#include "emitters/x64/jit_load_store_emitters.hpp"
using namespace InferenceEngine;
using namespace dnnl::impl::cpu::x64;

View File

@ -20,8 +20,8 @@
#include <cpu/x64/injectors/jit_uni_eltwise_injector.hpp>
#include "common/cpu_memcpy.h"
#include "utils/bfloat16.hpp"
#include "emitters/jit_bf16_emitters.hpp"
#include "emitters/jit_load_store_emitters.hpp"
#include "emitters/x64/jit_bf16_emitters.hpp"
#include "emitters/x64/jit_load_store_emitters.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset4.hpp>
@ -46,6 +46,8 @@ namespace ov {
namespace intel_cpu {
namespace node {
#if defined(OPENVINO_ARCH_X86_64)
template <cpu_isa_t isa>
struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_interpolate_kernel_f32)
@ -1364,9 +1366,11 @@ private:
}
};
#endif // OPENVINO_ARCH_X86_64
namespace {
struct InterpolateKey {
Interpolate::InterpolateAttrs nodeAttrs;
InterpolateAttrs nodeAttrs;
VectorDims srcDims;
VectorDims dstDims;
std::vector<float> dataScales;
@ -1548,7 +1552,7 @@ bool Interpolate::isSupportedOperation(const std::shared_ptr<const ngraph::Node>
namespace {
/**
* Interpolate shape inference factory. It defines the input mask depending on the shape calculation mode.
*
*
*/
class InterpolateShapeInferFactory : public ShapeInferFactory {
public:
@ -1769,7 +1773,7 @@ void Interpolate::initSupportedPrimitiveDescriptors() {
auto axesType = Precision::I32;
auto& creatorsMap = BlockedDescCreator::getCommonCreators();
auto pushDesc = [&](LayoutType dataFormat, impl_desc_type implDetail) {
auto pushDesc = [&](LayoutType dataFormat, impl_desc_type implDetail, bool useAclExecutor = false) {
config.inConfs[DATA_ID].setMemDesc(creatorsMap.at(dataFormat)->createSharedDesc(inputPrecision, getInputShapeAtPort(DATA_ID)));
config.inConfs[TARGET_SHAPE_ID].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(targetShapeType, getInputShapeAtPort(TARGET_SHAPE_ID)));
config.inConfs[SCALES_ID].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(scalesType, getInputShapeAtPort(SCALES_ID)));
@ -1778,12 +1782,39 @@ void Interpolate::initSupportedPrimitiveDescriptors() {
config.inConfs[AXES_ID].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(axesType, getInputShapeAtPort(AXES_ID)));
config.outConfs[0].setMemDesc(creatorsMap.at(dataFormat)->createSharedDesc(outputPrecision, getOutputShapeAtPort(0)));
supportedPrimitiveDescriptors.push_back({config, implDetail});
if (useAclExecutor) {
std::vector<MemoryDescPtr> srcMemoryDescs;
for (int i = 0; i < config.inConfs.size(); i++) {
srcMemoryDescs.push_back(config.inConfs[i].getMemDesc());
}
std::vector<MemoryDescPtr> dstMemoryDescs;
for (int i = 0; i < config.outConfs.size(); i++) {
dstMemoryDescs.push_back(config.outConfs[i].getMemDesc());
}
auto factory = std::make_shared<InterpolateExecutorFactory>(interpAttrs, srcMemoryDescs, dstMemoryDescs,
std::make_shared<ExecutorContext>(context, getPrimitivesPriority()));
if (!factory->isEmpty()) {
supportedPrimitiveDescriptors.push_back({config, implDetail, factory});
}
} else {
supportedPrimitiveDescriptors.push_back({config, implDetail});
}
};
const auto &dataMinDims = getInputShapeAtPort(DATA_ID).getMinDims();
bool isBlkApplied = getInputShapeAtPort(DATA_ID).getRank() > 1 && dataMinDims[1] != Shape::UNDEFINED_DIM && dataMinDims[1] > 1;
#if defined (OV_CPU_WITH_ACL)
interpAttrs.hasPad = hasPad;
pushDesc(LayoutType::nspc, undef, true);
pushDesc(LayoutType::ncsp, undef, true);
canUseAclExecutor = !supportedPrimitiveDescriptors.empty();
if (canUseAclExecutor)
return;
#endif
if (!mayiuse(cpu::x64::sse41) || interpAttrs.mode == InterpolateMode::linear) {
pushDesc(LayoutType::ncsp, ref);
} else {
@ -1897,11 +1928,28 @@ void Interpolate::prepareParams() {
IE_THROW() << "Interpolate layer only supports resize on spatial dimensions(depth, height and width)";
}
if (canUseAclExecutor) {
interpAttrs.dataScales = dataScales;
std::vector<MemoryDescPtr> srcMemoryDescs;
for (int i = 0; i < getParentEdges().size(); i++) {
srcMemoryDescs.push_back(getParentEdgeAt(i)->getMemoryPtr()->getDescPtr());
}
std::vector<MemoryDescPtr> dstMemoryDescs;
dstMemoryDescs.push_back(getChildEdgeAt(0)->getMemoryPtr()->getDescPtr());
auto selectedPD = getSelectedPrimitiveDescriptor();
aclExecPtr = selectedPD->getExecutorFactoryAs<InterpolateExecutorFactory>()->makeExecutor(interpAttrs, srcMemoryDescs, dstMemoryDescs, {});
selectedPD->setImplementationType(aclExecPtr->getImplType());
return;
}
InterpolateKey key = {interpAttrs, srcDims, dstDims, dataScales, dnnl::primitive_attr()};
setPostOps(key.attr, dstDims);
auto buildExecutor = [&](const InterpolateKey& key) -> std::shared_ptr<InterpolateExecutor> {
std::shared_ptr<InterpolateExecutor> executor;
auto buildExecutor = [&](const InterpolateKey& key) -> std::shared_ptr<InterpolateExecutorBase> {
std::shared_ptr<InterpolateExecutorBase> executor;
if ((key.nodeAttrs.mode == InterpolateMode::nearest || key.nodeAttrs.mode == InterpolateMode::linear_onnx ||
key.nodeAttrs.mode == InterpolateMode::cubic) &&
((key.nodeAttrs.layout != InterpolateLayoutType::planar && mayiuse(cpu::x64::sse41)) ||
@ -2013,89 +2061,92 @@ std::vector<float> Interpolate::getScales(const VectorDims &srcDimPad, const Vec
}
void Interpolate::execute(dnnl::stream strm) {
if (!execPtr) {
IE_THROW() << "Can't execute Interpolate node. Primitive didn't created";
}
auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
auto &srcMemPtr = getParentEdgeAt(DATA_ID)->getMemoryPtr();
uint8_t *dst_data = reinterpret_cast<uint8_t*>(dstMemPtr->GetPtr());
const uint8_t *src_data_origin = reinterpret_cast<uint8_t*>(srcMemPtr->GetData());
if (execPtr) {
uint8_t *dst_data = reinterpret_cast<uint8_t*>(dstMemPtr->GetPtr());
const uint8_t *src_data_origin = reinterpret_cast<uint8_t*>(srcMemPtr->GetData());
const auto &srcDim = srcMemPtr->getStaticDims();
const auto &dstDim = dstMemPtr->getStaticDims();
size_t dimSize = srcDim.size();
auto srcDimPad = execPtr->getSrcDimPad5d();
const auto &srcDim = srcMemPtr->getStaticDims();
const auto &dstDim = dstMemPtr->getStaticDims();
size_t dimSize = srcDim.size();
auto srcDimPad = execPtr->getSrcDimPad5d();
const auto srcDim5d = to5Dim(srcDim);
const auto srcDimPad5d = to5Dim(srcDimPad);
const auto dstDim5d = to5Dim(dstDim);
const auto srcDataSize = srcMemPtr->getDesc().getPrecision().size();
const auto srcDim5d = to5Dim(srcDim);
const auto srcDimPad5d = to5Dim(srcDimPad);
const auto dstDim5d = to5Dim(dstDim);
const auto srcDataSize = srcMemPtr->getDesc().getPrecision().size();
const uint8_t *src_data = nullptr;
std::vector<uint8_t> srcPadded;
if (hasPad) {
int padB0 = (dimSize > 2) ? interpAttrs.padBegin[0] : 0;
int padB1 = (dimSize > 2) ? interpAttrs.padBegin[1] : 0;
int padB2 = (dimSize == 5) ? interpAttrs.padBegin[dimSize - 3] : 0;
int padB3 = interpAttrs.padBegin[dimSize - 2];
int padB4 = interpAttrs.padBegin[dimSize - 1];
const uint8_t *src_data = nullptr;
std::vector<uint8_t> srcPadded;
if (hasPad) {
int padB0 = (dimSize > 2) ? interpAttrs.padBegin[0] : 0;
int padB1 = (dimSize > 2) ? interpAttrs.padBegin[1] : 0;
int padB2 = (dimSize == 5) ? interpAttrs.padBegin[dimSize - 3] : 0;
int padB3 = interpAttrs.padBegin[dimSize - 2];
int padB4 = interpAttrs.padBegin[dimSize - 1];
SizeVector inShapeBlock = getBlockND(srcDim5d);
SizeVector inShapePadBlock = getBlockND(srcDimPad5d);
SizeVector inShapeBlock = getBlockND(srcDim5d);
SizeVector inShapePadBlock = getBlockND(srcDimPad5d);
if (interpAttrs.layout == InterpolateLayoutType::planar) {
srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) {
const uint8_t *src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + inShapeBlock[3] * d + inShapeBlock[4] * h) * srcDataSize;
uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) +
inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * srcDataSize;
cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize);
});
src_data = src_data_pad;
} else if (interpAttrs.layout == InterpolateLayoutType::by_channel) {
srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) {
const uint8_t *src = src_data_origin + (inShapeBlock[1] * n +
(inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * srcDataSize;
uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + (inShapePadBlock[3] * (d + padB2) +
inShapePadBlock[4] * (h + padB3) + inShapePadBlock[5] * (w + padB4)) * srcDimPad5d[1] + padB1) * srcDataSize;
cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize);
});
src_data = src_data_pad;
} else if (interpAttrs.layout == InterpolateLayoutType::block) {
size_t blkSize = mayiuse(cpu::x64::avx512_core) ? 16 : 8;
size_t CB = div_up(srcDimPad5d[1], blkSize);
size_t eltsTotal = srcDimPad5d[0] * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize;
srcPadded.resize(eltsTotal * srcDataSize, 0x0);
uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
if ((srcDim5d[0] != srcDimPad5d[0]) || (srcDim5d[1] != srcDimPad5d[1])) {
IE_THROW() << "Interpolate layer with name '" << getName() <<
"' does not support padding on batch and channel dimensions";
if (interpAttrs.layout == InterpolateLayoutType::planar) {
srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) {
const uint8_t *src = src_data_origin +
(inShapeBlock[1] * n + inShapeBlock[2] * c + inShapeBlock[3] * d + inShapeBlock[4] * h) * srcDataSize;
uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) +
inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * srcDataSize;
cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize);
});
src_data = src_data_pad;
} else if (interpAttrs.layout == InterpolateLayoutType::by_channel) {
srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) {
const uint8_t *src = src_data_origin + (inShapeBlock[1] * n +
(inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * srcDataSize;
uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + (inShapePadBlock[3] * (d + padB2) +
inShapePadBlock[4] * (h + padB3) + inShapePadBlock[5] * (w + padB4)) * srcDimPad5d[1] + padB1) * srcDataSize;
cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize);
});
src_data = src_data_pad;
} else if (interpAttrs.layout == InterpolateLayoutType::block) {
size_t blkSize = mayiuse(cpu::x64::avx512_core) ? 16 : 8;
size_t CB = div_up(srcDimPad5d[1], blkSize);
size_t eltsTotal = srcDimPad5d[0] * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize;
srcPadded.resize(eltsTotal * srcDataSize, 0x0);
uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
if ((srcDim5d[0] != srcDimPad5d[0]) || (srcDim5d[1] != srcDimPad5d[1])) {
IE_THROW() << "Interpolate layer with name '" << getName() <<
"' does not support padding on batch and channel dimensions";
}
parallel_for5d(srcDim5d[0], CB, srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int cb, int d, int h, int w) {
const uint8_t *src = src_data_origin + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
+ (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
+ (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
+ (h * srcDim5d[4] * blkSize) * srcDataSize
+ (w * blkSize) * srcDataSize;
uint8_t *srcPad = src_data_pad + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
+ (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
+ ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
+ ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize
+ ((w + padB4) * blkSize) * srcDataSize;
cpu_memcpy(srcPad, src, blkSize * srcDataSize);
});
src_data = src_data_pad;
}
parallel_for5d(srcDim5d[0], CB, srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int cb, int d, int h, int w) {
const uint8_t *src = src_data_origin + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
+ (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
+ (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
+ (h * srcDim5d[4] * blkSize) * srcDataSize
+ (w * blkSize) * srcDataSize;
uint8_t *srcPad = src_data_pad + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
+ (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
+ ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
+ ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize
+ ((w + padB4) * blkSize) * srcDataSize;
cpu_memcpy(srcPad, src, blkSize * srcDataSize);
});
src_data = src_data_pad;
} else {
src_data = src_data_origin;
}
} else {
src_data = src_data_origin;
}
execPtr->exec(src_data, dst_data, postOpsDataPtrs.data());
execPtr->exec(src_data, dst_data, postOpsDataPtrs.data());
} else if (aclExecPtr) {
aclExecPtr->exec({srcMemPtr}, {dstMemPtr}, postOpsDataPtrs.data());
} else {
IE_THROW() << "Can't execute Interpolate node. Primitive didn't created";
}
}
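The restructured execute() above now selects between two executor families: the JIT/reference executors, which consume raw byte pointers (after any CPU-side padding has been materialized), and the ACL executor, which takes the memory objects directly. Below is a minimal, self-contained sketch of that dispatch, using hypothetical stand-in types rather than the plugin's real ones.

#include <cstdint>
#include <memory>
#include <stdexcept>
#include <vector>

struct Memory { std::vector<uint8_t> data; };
using MemoryPtr = std::shared_ptr<Memory>;

struct JitExecutor { void exec(const uint8_t* /*src*/, uint8_t* /*dst*/) { /* run JIT or reference kernel */ } };
struct AclExecutor { void exec(const std::vector<MemoryPtr>& /*src*/, const std::vector<MemoryPtr>& /*dst*/) { /* call into ACL */ } };

void executeNode(const std::shared_ptr<JitExecutor>& execPtr,
                 const std::shared_ptr<AclExecutor>& aclExecPtr,
                 const MemoryPtr& srcMem, const MemoryPtr& dstMem) {
    if (execPtr) {
        // padding, if any, would be applied to a temporary buffer before this call
        execPtr->exec(srcMem->data.data(), dstMem->data.data());
    } else if (aclExecPtr) {
        aclExecPtr->exec({srcMem}, {dstMem});
    } else {
        throw std::runtime_error("no executor was created");
    }
}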
// for ndhwc and nCdhw8c[16c]
@ -2369,7 +2420,7 @@ void Interpolate::InterpolateJitExecutor::cubicPlanar(const uint8_t *in_ptr_, ui
// =====================================================================================================================
// index layout:
// d_0............d_OD-1, h_0..............h_OH-1, w_0................w_OW-1
void Interpolate::InterpolateExecutor::buildTblNN(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
void Interpolate::InterpolateExecutorBase::buildTblNN(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
const std::vector<float>& dataScales, InterpolateLayoutType layout, InterpolateNearestMode nearestMode) {
const int dimSize = dataRank;
float fz = (dimSize == 5) ? dataScales[dimSize - 3] : 1.f;
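As a worked example of the index table this routine builds (hypothetical sizes): upsampling the W axis from 2 to 4 with half_pixel and round_prefer_floor maps output coordinates 0..3 to input coordinates (x_out + 0.5)/2 - 0.5 = -0.25, 0.25, 0.75, 1.25, which round and clamp to the stored index row 0, 0, 1, 1.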
@ -2402,7 +2453,7 @@ void Interpolate::InterpolateExecutor::buildTblNN(const SizeVector& srcDimPad5d,
// scale is float(outShape) / float(inShape)
// strictly consistent with the onnx calculation manner (divide by scale, do not multiply by the inverse); since this is done offline,
// the slight precision difference could otherwise produce an obviously wrong value due to the "nearest round" behavior of NN mode
float Interpolate::InterpolateExecutor::coordTransToInput(int outCoord, float scale, int inShape, int outShape) const {
float Interpolate::InterpolateExecutorBase::coordTransToInput(int outCoord, float scale, int inShape, int outShape) const {
if (scale == 1.0f || (inShape == outShape)) {
return outCoord;
}
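For reference, the coordinate transformations this routine mirrors (per the ONNX Resize specification), with s = outShape / inShape as noted in the comment above:

x_{\mathrm{in}} = \frac{x_{\mathrm{out}} + 0.5}{s} - 0.5 \quad \text{(half\_pixel; pytorch\_half\_pixel additionally yields 0 when outShape} \le 1\text{)}
x_{\mathrm{in}} = \frac{x_{\mathrm{out}}}{s} \quad \text{(asymmetric)}
x_{\mathrm{in}} = \frac{x_{\mathrm{out}} + 0.5}{s} \quad \text{(tf\_half\_pixel\_for\_nn)}
x_{\mathrm{in}} = x_{\mathrm{out}} \cdot \frac{\mathrm{inShape} - 1}{\mathrm{outShape} - 1} \quad \text{(align\_corners)}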
@ -2440,7 +2491,7 @@ float Interpolate::InterpolateExecutor::coordTransToInput(int outCoord, float sc
}
}
int Interpolate::InterpolateExecutor::nearestRound(float originCoord, bool isDownsample, InterpolateNearestMode nearestMode) const {
int Interpolate::InterpolateExecutorBase::nearestRound(float originCoord, bool isDownsample, InterpolateNearestMode nearestMode) const {
switch (nearestMode) {
case InterpolateNearestMode::round_prefer_floor: {
if (originCoord == (static_cast<int>(originCoord) + 0.5f))
@ -2474,7 +2525,7 @@ int Interpolate::InterpolateExecutor::nearestRound(float originCoord, bool isDow
}
}
void Interpolate::InterpolateExecutor::linearOnnxCF(int outCoord, float scale, int inShape, int outShape,
void Interpolate::InterpolateExecutorBase::linearOnnxCF(int outCoord, float scale, int inShape, int outShape,
int& index0, int& index1, float& weight0, float& weight1) {
float inCoord = coordTransToInput(outCoord, scale, inShape, outShape);
inCoord = std::max(0.0f, std::min(inCoord, static_cast<float>(inShape - 1)));
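The remainder of this helper (elided by the hunk) is expected to compute the standard linear-ONNX pair of taps and weights; with x_in the clamped input coordinate from above:

index_0 = \lfloor x_{\mathrm{in}} \rfloor, \qquad index_1 = \min(index_0 + 1,\ \mathrm{inShape} - 1)
weight_1 = x_{\mathrm{in}} - index_0, \qquad weight_0 = 1 - weight_1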
@ -2489,7 +2540,7 @@ void Interpolate::InterpolateExecutor::linearOnnxCF(int outCoord, float scale, i
}
}
void Interpolate::InterpolateExecutor::buildTblLinearOnnx(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
void Interpolate::InterpolateExecutorBase::buildTblLinearOnnx(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
const std::vector<float>& dataScales, InterpolateLayoutType layout) {
int dimSize = dataRank;
float fz = (spatialDimSize > 2) ? dataScales[dimSize - 3] : 1.f;
@ -2602,7 +2653,7 @@ void Interpolate::InterpolateExecutor::buildTblLinearOnnx(const SizeVector& srcD
// wd .........wd, wh............wh, ww.............ww, id...........id, ih............ih, iw..............iw
// | |
// wh0.....wh_diameter ih0.....ih_diameter
void Interpolate::InterpolateExecutor::buildTblLinear(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
void Interpolate::InterpolateExecutorBase::buildTblLinear(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
const std::vector<float>& dataScales, int kernel_width, bool antialias) {
int dimSize = dataRank;
float fz = (dimSize == 5) ? dataScales[dimSize - 3] : 1.f;
@ -2679,7 +2730,7 @@ void Interpolate::InterpolateExecutor::buildTblLinear(const SizeVector& srcDimPa
}
}
std::vector<float> Interpolate::InterpolateExecutor::getCubicCoeffs(float mantissa, float a) {
std::vector<float> Interpolate::InterpolateExecutorBase::getCubicCoeffs(float mantissa, float a) {
float m = std::fabs(mantissa);
std::vector<float> coeffs(4, 0.f);
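The coefficients this helper produces are expected to follow Keys' cubic-convolution kernel (the formulation used by the ONNX Resize operator) with a = cubeCoeff, -0.75 by default; for a fractional offset m the four tap weights are W(1+m), W(m), W(1-m), W(2-m), where

W(x) = \begin{cases} (a+2)|x|^{3} - (a+3)|x|^{2} + 1, & |x| \le 1 \\ a|x|^{3} - 5a|x|^{2} + 8a|x| - 4a, & 1 < |x| < 2 \\ 0, & \text{otherwise.} \end{cases}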
@ -2693,7 +2744,7 @@ std::vector<float> Interpolate::InterpolateExecutor::getCubicCoeffs(float mantis
// table layout:
// OW OW OW OW OW OH OH OH OH OH
// x_idx x_weight0 x_weight1 x_weight2 x_weight3 y_idx y_weight0 y_weight1 y_weight2 y_weight3
void Interpolate::InterpolateExecutor::buildTblCubic(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector<float>& dataScales,
void Interpolate::InterpolateExecutorBase::buildTblCubic(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector<float>& dataScales,
float cubicCoeff, InterpolateLayoutType layout) {
int dimSize = dataRank;
float fy = dataScales[dimSize - 2];
@ -3085,7 +3136,7 @@ void Interpolate::InterpolateRefExecutor::linearInterpolation(const uint8_t *in_
});
}
Interpolate::InterpolateExecutor::InterpolateExecutor(const InterpolateAttrs& interpAttrs,
Interpolate::InterpolateExecutorBase::InterpolateExecutorBase(const InterpolateAttrs& interpAttrs,
const VectorDims &srcDims,
const VectorDims &dstDims,
const std::vector<float> &dataScales) :
@ -3128,7 +3179,7 @@ Interpolate::InterpolateJitExecutor::InterpolateJitExecutor(const InterpolateAtt
const VectorDims &dstDims,
const std::vector<float> &dataScales,
const dnnl::primitive_attr &attr) :
InterpolateExecutor(interpAttrs, srcDims, dstDims, dataScales) {
InterpolateExecutorBase(interpAttrs, srcDims, dstDims, dataScales) {
auto jcp = jit_interpolate_config_params();
jcp.mode = mode;
jcp.src_prc = interpAttrs.inPrc;
@ -3145,6 +3196,7 @@ Interpolate::InterpolateJitExecutor::InterpolateJitExecutor(const InterpolateAtt
jcp.ID = srcDimPad5d[2];
jcp.spatial_dim_size = getSpatialDimsNum(srcDims.size());
jcp.layout = interpAttrs.layout;
#if defined(OPENVINO_ARCH_X86_64)
if (jcp.layout != InterpolateLayoutType::planar) {
if (mayiuse(cpu::x64::avx512_core)) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::x64::avx512_core>(jcp, *attr.get()));
@ -3159,6 +3211,7 @@ Interpolate::InterpolateJitExecutor::InterpolateJitExecutor(const InterpolateAtt
} else {
IE_THROW() << "Can't create InterpolateJitExecutor";
}
#endif // OPENVINO_ARCH_X86_64
if (interpolateKernel) {
interpolateKernel->create_ker();
} else {
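With the OPENVINO_ARCH_X86_64 guard above, non-x86-64 builds never construct a JIT kernel, so this constructor throws and the node is expected to rely on the ACL or reference executor instead. A self-contained illustration of the same idiom (hypothetical names, not the plugin's code):

#include <memory>
#include <stdexcept>

struct Kernel { virtual ~Kernel() = default; };

std::unique_ptr<Kernel> makeJitKernel() {
    std::unique_ptr<Kernel> kernel;
#if defined(OPENVINO_ARCH_X86_64)
    kernel.reset(new Kernel());  // a real build would pick an AVX-512 / AVX2 / SSE4.1 kernel here
#endif
    if (!kernel)
        throw std::runtime_error("JIT interpolate kernels are unavailable on this architecture");
    return kernel;
}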
@ -3266,4 +3319,4 @@ bool Interpolate::created() const {
} // namespace node
} // namespace intel_cpu
} // namespace ov
} // namespace ov

View File

@ -9,6 +9,8 @@
#include <string>
#include <memory>
#include <vector>
#include "executors/interpolate.hpp"
#include "executors/interpolate_list.hpp"
#define MAX_INPUT_INTERPOLATE 8
@ -18,40 +20,6 @@ namespace ov {
namespace intel_cpu {
namespace node {
enum InterpolateLayoutType {
planar,
block,
by_channel
};
enum InterpolateMode {
nearest,
linear,
linear_onnx,
cubic
};
enum InterpolateCoordTransMode {
half_pixel,
pytorch_half_pixel,
asymmetric,
tf_half_pixel_for_nn,
align_corners
};
enum class InterpolateNearestMode {
round_prefer_floor,
round_prefer_ceil,
floor,
ceil,
simple
};
enum class InterpolateShapeCalcMode {
sizes,
scales
};
struct jit_interpolate_config_params {
InterpolateLayoutType layout;
InterpolateMode mode;
@ -121,31 +89,18 @@ public:
bool needPrepareParams() const override;
void prepareParams() override;
struct InterpolateAttrs {
InterpolateMode mode = InterpolateMode::nearest;
InterpolateCoordTransMode coordTransMode = InterpolateCoordTransMode::half_pixel;
InterpolateNearestMode nearestMode = InterpolateNearestMode::round_prefer_floor;
bool antialias = false;
float cubeCoeff = -0.75;
std::vector<int> padBegin;
std::vector<int> padEnd;
InferenceEngine::Precision inPrc;
InferenceEngine::Precision outPrc;
InterpolateLayoutType layout;
};
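The enums and the InterpolateAttrs struct removed above now live in the shared executor header (executors/interpolate.hpp), so the ACL executor and the node consume the same attribute set. Below is a sketch of the relocated struct, reconstructed from the removed lines; the hasPad and dataScales members are inferred from their use in interpolate.cpp (interpAttrs.hasPad, interpAttrs.dataScales) and may not match the real header exactly.

struct InterpolateAttrs {
    InterpolateMode mode = InterpolateMode::nearest;
    InterpolateCoordTransMode coordTransMode = InterpolateCoordTransMode::half_pixel;
    InterpolateNearestMode nearestMode = InterpolateNearestMode::round_prefer_floor;
    bool antialias = false;
    float cubeCoeff = -0.75;
    std::vector<int> padBegin;
    std::vector<int> padEnd;
    InferenceEngine::Precision inPrc;
    InferenceEngine::Precision outPrc;
    InterpolateLayoutType layout;
    bool hasPad = false;              // inferred from interpAttrs.hasPad
    std::vector<float> dataScales;    // inferred from interpAttrs.dataScales
};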
private:
InterpolateAttrs interpAttrs;
class InterpolateExecutor {
class InterpolateExecutorBase {
public:
InterpolateExecutor(const InterpolateAttrs& interpAttrs,
InterpolateExecutorBase(const InterpolateAttrs& interpAttrs,
const VectorDims &srcDims,
const VectorDims &dstDims,
const std::vector<float> &dataScales);
virtual void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) = 0;
virtual ~InterpolateExecutor() = default;
virtual ~InterpolateExecutorBase() = default;
VectorDims getSrcDimPad5d() const { return srcDimPad5d; }
private:
@ -174,9 +129,9 @@ private:
size_t dataRank;
std::vector<int> indexTable;
};
std::shared_ptr<InterpolateExecutor> execPtr = nullptr;
std::shared_ptr<InterpolateExecutorBase> execPtr = nullptr;
class InterpolateJitExecutor : public InterpolateExecutor {
class InterpolateJitExecutor : public InterpolateExecutorBase {
public:
InterpolateJitExecutor(const InterpolateAttrs& interpAttrs,
const VectorDims &srcDims,
@ -209,13 +164,13 @@ private:
std::shared_ptr<jit_uni_interpolate_kernel> interpolateKernel = nullptr;
};
class InterpolateRefExecutor : public InterpolateExecutor {
class InterpolateRefExecutor : public InterpolateExecutorBase {
public:
InterpolateRefExecutor(const InterpolateAttrs& interpAttrs,
const VectorDims &srcDims,
const VectorDims &dstDims,
const std::vector<float> &_dataScales) :
InterpolateExecutor(interpAttrs, srcDims, dstDims, _dataScales),
InterpolateExecutorBase(interpAttrs, srcDims, dstDims, _dataScales),
antialias(interpAttrs.antialias), dataScales(_dataScales) {}
void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) override;
@ -259,8 +214,11 @@ private:
VectorDims lastOutputDims;
std::string errorPrefix;
bool canUseAclExecutor = false;
std::shared_ptr<InterpolateExecutor> aclExecPtr = nullptr;
};
} // namespace node
} // namespace intel_cpu
} // namespace ov
} // namespace ov

View File

@ -4,7 +4,7 @@
#pragma once
#include <cpu/x64/jit_generator.hpp>
#include <emitters/jit_load_store_emitters.hpp>
#include "emitters/x64/jit_load_store_emitters.hpp"
#include <ie/ie_precision.hpp>
#include <common/nstl.hpp>
#include <type_traits>

Some files were not shown because too many files have changed in this diff