[CPU] ARM architecture support (#15256)
* [CPU] ARM architecture support: this patch extends the existing CPU plugin with optimized support for ARM CPUs.
This commit is contained in:
parent a368e10fff
commit c283d21215

.gitmodules (vendored, 3 lines changed)
@@ -69,3 +69,6 @@
 [submodule "thirdparty/snappy"]
 	path = thirdparty/snappy
 	url = https://github.com/google/snappy.git
+[submodule "ARMComputeLibrary"]
+	path = src/plugins/intel_cpu/thirdparty/ComputeLibrary
+	url = https://github.com/ARM-software/ComputeLibrary.git
@@ -6,7 +6,7 @@
 # Common cmake options
 #

-ie_dependent_option (ENABLE_INTEL_CPU "CPU plugin for OpenVINO Runtime" ON "RISCV64 OR X86 OR X86_64" OFF)
+ie_dependent_option (ENABLE_INTEL_CPU "CPU plugin for OpenVINO Runtime" ON "RISCV64 OR X86 OR X86_64 OR AARCH64" OFF)

 ie_option (ENABLE_TESTS "unit, behavior and functional tests" OFF)
@@ -38,6 +38,43 @@ if(ENABLE_TESTS)
     add_subdirectory(tests)
 endif()

+add_definitions(-DOV_CPU_WITH_DNNL)
+set(OV_CPU_WITH_DNNL ON)
+
+if(DNNL_AARCH64_USE_ACL)
+    add_definitions(-DOV_CPU_WITH_ACL)
+    set(OV_CPU_WITH_ACL ON)
+endif()
+
+if(OV_CPU_WITH_ACL)
+    set(CMAKE_CXX_STANDARD 14)
+endif()
+
+# remove target specific files from compilation
+if(NOT OV_CPU_WITH_ACL)
+    set(EXCLUDE_PATHS ${EXCLUDE_PATHS}
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/acl/*)
+endif()
+
+if(NOT X86_64)
+    set(EXCLUDE_PATHS ${EXCLUDE_PATHS}
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/x64/*
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/x64/*
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/x64/*
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/x64/*
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/snippets/x64/*)
+endif()
+
+if(NOT AARCH64)
+    set(EXCLUDE_PATHS ${EXCLUDE_PATHS}
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/arm/*)
+endif()
+
+file(GLOB_RECURSE FILES_TO_REMOVE ${EXCLUDE_PATHS})
+list(REMOVE_ITEM SOURCES ${FILES_TO_REMOVE})
+list(REMOVE_ITEM HEADERS ${FILES_TO_REMOVE})
+
 # create plugin

 ie_add_plugin(NAME ${TARGET_NAME}
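An editor's aside, not part of the patch: the build change above makes `OV_CPU_WITH_ACL` a compile definition, so source files can branch on it the same way the later hunks do. A minimal standalone sketch of consuming that define:

```cpp
#include <cstdio>

int main() {
#if defined(OV_CPU_WITH_ACL)
    std::printf("built with Arm Compute Library support\n");
#else
    std::printf("built without ACL; x64/reference paths only\n");
#endif
    return 0;
}
```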
@@ -27,4 +27,4 @@ CPU Plugin contains the following components:
 * [OpenVINO™ README](../../../README.md)
 * [OpenVINO Core Components](../../README.md)
 * [OpenVINO Plugins](../README.md)
 * [Developer documentation](../../../docs/dev/index.md)
@@ -79,6 +79,8 @@ MultiCache::EntryPtr<KeyType, ValueType> MultiCache::getEntry() {
     return std::static_pointer_cast<EntryType>(itr->second);
 }

 using MultiCacheWeakPtr = std::weak_ptr<MultiCache>;
 using MultiCacheWeakCPtr = std::weak_ptr<const MultiCache>;
 using MultiCachePtr = std::shared_ptr<MultiCache>;
 using MultiCacheCPtr = std::shared_ptr<const MultiCache>;
@@ -263,6 +263,11 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
     if (exclusiveAsyncRequests)  // Exclusive request feature disables the streams
         streamExecutorConfig._streams = 1;

+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+    // TODO: multi-stream execution has functional issues on ARM target
+    streamExecutorConfig._streams = 1;
+#endif
+
     CPU_DEBUG_CAP_ENABLE(applyDebugCapsProperties());
     updateProperties();
 }
@@ -48,7 +48,12 @@ struct Config {
     std::string device_id = {};
     int batchLimit = 0;
     float fcSparseWeiDecompressionRate = 1.0f;
+#if defined(OPENVINO_ARCH_X86_64)
     size_t rtCacheCapacity = 5000ul;
+#else
+    // TODO: the executor cache may lead to incorrect behavior with oneDNN ACL primitives
+    size_t rtCacheCapacity = 0ul;
+#endif
     InferenceEngine::IStreamsExecutor::Config streamExecutorConfig;
     InferenceEngine::PerfHintsConfig perfHintsConfig;
     bool enableCpuPinning = true;
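A standalone sketch, not from the patch, of what a zero `rtCacheCapacity` implies: with capacity 0 nothing is ever retained, so every lookup misses and no stale executor can be reused on the ACL path. Types here are toy stand-ins:

```cpp
#include <map>
#include <string>

// Toy runtime cache with a hard capacity; capacity 0 disables caching entirely.
class RuntimeCache {
public:
    explicit RuntimeCache(std::size_t capacity) : capacity_(capacity) {}

    bool put(const std::string& key, int value) {
        if (cache_.size() >= capacity_)
            return false;  // with capacity 0 this always refuses the insert
        cache_.emplace(key, value);
        return true;
    }

private:
    std::size_t capacity_;
    std::map<std::string, int> cache_;
};

int main() {
    RuntimeCache cache(0);
    return cache.put("conv_1x1_f32", 42) ? 1 : 0;  // expect 0: insert refused
}
```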
@@ -194,7 +194,7 @@ public:
     }

     enum : Dim {
-        UNDEFINED_DIM = 0xffffffffffffffff
+        UNDEFINED_DIM = std::numeric_limits<Dim>::max()
     };

 private:
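An editor's aside on the change above: `std::numeric_limits<Dim>::max()` tracks the width of `Dim` automatically, while the hex literal silently assumed a 64-bit type. A self-contained check of the equivalence:

```cpp
#include <cstddef>
#include <limits>

using Dim = std::size_t;

enum : Dim {
    UNDEFINED_DIM = std::numeric_limits<Dim>::max()
};

// max() is the all-ones bit pattern whatever width Dim has on the target.
static_assert(UNDEFINED_DIM == static_cast<Dim>(-1),
              "sentinel matches the all-ones pattern regardless of Dim's width");

int main() { return 0; }
```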
@@ -9,9 +9,6 @@
 namespace ov {
 namespace intel_cpu {

-using Dim = std::size_t;
-using VectorDims = std::vector<Dim>;
-
 const InferenceEngine::details::caseless_unordered_map<std::string, Type> type_to_name_tbl = {
     { "Constant", Type::Input },
     { "Parameter", Type::Input },
@@ -69,6 +66,7 @@ const InferenceEngine::details::caseless_unordered_map<std::string, Type> type_to_name_tbl = {
     { "SoftPlus", Type::Eltwise },
     { "SoftSign", Type::Eltwise },
     { "Select", Type::Eltwise},
+    { "Log", Type::Eltwise },
     { "Reshape", Type::Reshape },
     { "Squeeze", Type::Reshape },
     { "Unsqueeze", Type::Reshape },
@@ -163,7 +161,6 @@ const InferenceEngine::details::caseless_unordered_map<std::string, Type> type_to_name_tbl = {
     { "Floor", Type::Math},
     { "HardSigmoid", Type::Math},
     { "If", Type::If},
-    { "Log", Type::Math},
     { "Neg", Type::Math},
     { "Reciprocal", Type::Math},
     { "Selu", Type::Math},
@@ -448,7 +445,8 @@ std::string algToString(const Algorithm alg) {
    CASE(EltwiseLogicalXor);
    CASE(EltwiseLogicalNot);
    CASE(EltwiseRelu);
-   CASE(EltwiseGelu);
+   CASE(EltwiseGeluErf);
+   CASE(EltwiseGeluTanh);
    CASE(EltwiseElu);
    CASE(EltwiseTanh);
    CASE(EltwiseSelect);
@@ -466,10 +464,10 @@ std::string algToString(const Algorithm alg) {
    CASE(EltwiseRoundHalfToEven);
    CASE(EltwiseRoundHalfAwayFromZero);
    CASE(EltwiseErf);
+   CASE(EltwiseLog);
    CASE(FQCommon);
    CASE(FQQuantization);
    CASE(FQBinarization);
    CASE(FQRequantization);
    CASE(ROIPoolingMax);
    CASE(ROIPoolingBilinear);
    CASE(ROIAlignMax);
@@ -502,7 +500,6 @@ std::string algToString(const Algorithm alg) {
    CASE(MathErf);
    CASE(MathFloor);
    CASE(MathHardSigmoid);
-   CASE(MathLog);
    CASE(MathNegative);
    CASE(MathReciprocal);
    CASE(MathSelu);
@@ -160,7 +160,8 @@ enum class Algorithm {
     EltwiseLogicalXor,
     EltwiseLogicalNot,
     EltwiseRelu,
-    EltwiseGelu,
+    EltwiseGeluErf,
+    EltwiseGeluTanh,
     EltwiseElu,
     EltwiseTanh,
     EltwiseSigmoid,
@@ -179,12 +180,12 @@ enum class Algorithm {
     EltwiseRoundHalfAwayFromZero,
     EltwiseErf,
     EltwiseSoftSign,
+    EltwiseLog,

     // FakeQuantize algorithms
     FQCommon,
     FQQuantization,
     FQBinarization,
     FQRequantization,

     // ROIPooling algorithms
     ROIPoolingMax,
@@ -227,7 +228,6 @@ enum class Algorithm {
     MathErf,
     MathFloor,
     MathHardSigmoid,
-    MathLog,
     MathNegative,
     MathReciprocal,
     MathSelu,
@@ -203,5 +203,80 @@ const char* DnnlExtensionUtils::query_pd_info(const_dnnl_primitive_desc_t pd) {
     return pd->info();
 }

+dnnl::algorithm DnnlExtensionUtils::convertToDnnlAlgorithm(Algorithm alg) {
+    switch (alg) {
+    case Algorithm::EltwiseRelu: return dnnl::algorithm::eltwise_relu;
+    case Algorithm::EltwiseTanh: return dnnl::algorithm::eltwise_tanh;
+    case Algorithm::EltwiseElu: return dnnl::algorithm::eltwise_elu;
+    case Algorithm::EltwiseAbs: return dnnl::algorithm::eltwise_abs;
+    case Algorithm::EltwiseSqrt: return dnnl::algorithm::eltwise_sqrt;
+    case Algorithm::EltwiseSwish: return dnnl::algorithm::eltwise_swish;
+    case Algorithm::EltwiseHswish: return dnnl::algorithm::eltwise_hardswish;
+    case Algorithm::EltwiseSoftRelu: return dnnl::algorithm::eltwise_soft_relu;
+    case Algorithm::EltwiseMish: return dnnl::algorithm::eltwise_mish;
+    case Algorithm::EltwiseExp: return dnnl::algorithm::eltwise_exp;
+    case Algorithm::EltwiseGeluErf: return dnnl::algorithm::eltwise_gelu_erf;
+    case Algorithm::EltwiseGeluTanh: return dnnl::algorithm::eltwise_gelu_tanh;
+    case Algorithm::EltwiseSigmoid: return dnnl::algorithm::eltwise_logistic;
+    case Algorithm::EltwiseClamp: return dnnl::algorithm::eltwise_clip;
+    case Algorithm::EltwisePowerStatic: return dnnl::algorithm::eltwise_pow;
+    case Algorithm::EltwiseHsigmoid: return dnnl::algorithm::eltwise_hsigmoid;
+    case Algorithm::EltwiseRoundHalfToEven: return dnnl::algorithm::eltwise_round_half_to_even;
+    case Algorithm::EltwiseRoundHalfAwayFromZero: return dnnl::algorithm::eltwise_round_half_away_from_zero;
+    case Algorithm::EltwiseAdd: return dnnl::algorithm::binary_add;
+    case Algorithm::EltwiseMultiply: return dnnl::algorithm::binary_mul;
+    case Algorithm::EltwiseSubtract: return dnnl::algorithm::binary_sub;
+    case Algorithm::EltwiseDivide: return dnnl::algorithm::binary_div;
+    case Algorithm::EltwiseMaximum: return dnnl::algorithm::binary_max;
+    case Algorithm::EltwiseMinimum: return dnnl::algorithm::binary_min;
+    case Algorithm::EltwiseEqual: return dnnl::algorithm::binary_eq;
+    case Algorithm::EltwiseNotEqual: return dnnl::algorithm::binary_ne;
+    case Algorithm::EltwiseGreater: return dnnl::algorithm::binary_gt;
+    case Algorithm::EltwiseGreaterEqual: return dnnl::algorithm::binary_ge;
+    case Algorithm::EltwiseLess: return dnnl::algorithm::binary_lt;
+    case Algorithm::EltwiseLessEqual: return dnnl::algorithm::binary_le;
+    case Algorithm::EltwisePrelu: return dnnl::algorithm::binary_prelu;
+    case Algorithm::ReduceMax: return dnnl::algorithm::reduction_max;
+    case Algorithm::ReduceMin: return dnnl::algorithm::reduction_min;
+    case Algorithm::ReduceSum: return dnnl::algorithm::reduction_sum;
+    case Algorithm::ReduceMean: return dnnl::algorithm::reduction_mean;
+    case Algorithm::FQCommon: return dnnl::algorithm::quantization_quantize_dequantize;
+    case Algorithm::FQQuantization: return dnnl::algorithm::quantization_quantize;
+    case Algorithm::FQBinarization: return dnnl::algorithm::binarization_depthwise;
+    default: return dnnl::algorithm::undef;
+    }
+}
+
+bool DnnlExtensionUtils::isUnarySupportedAsPostOp(Algorithm alg) {
+#if defined(OV_CPU_WITH_ACL)
+    return one_of(alg, Algorithm::EltwiseRelu,
+                       Algorithm::EltwiseTanh,
+                       Algorithm::EltwiseElu,
+                       Algorithm::EltwiseAbs,
+                       Algorithm::EltwiseSqrt,
+                       Algorithm::EltwiseSoftRelu,
+                       Algorithm::EltwiseSigmoid);
+#elif defined(OPENVINO_ARCH_X86_64)
+    return one_of(alg, Algorithm::EltwiseRelu,
+                       Algorithm::EltwiseGeluErf,
+                       Algorithm::EltwiseGeluTanh,
+                       Algorithm::EltwiseElu,
+                       Algorithm::EltwiseSigmoid,
+                       Algorithm::EltwiseClamp,
+                       Algorithm::EltwiseTanh,
+                       Algorithm::EltwiseSwish,
+                       Algorithm::EltwiseHswish,
+                       Algorithm::EltwiseMish,
+                       Algorithm::EltwiseHsigmoid,
+                       Algorithm::EltwiseRoundHalfToEven,
+                       Algorithm::EltwiseRoundHalfAwayFromZero,
+                       Algorithm::EltwiseAbs,
+                       Algorithm::EltwiseSqrt,
+                       Algorithm::EltwiseSoftRelu);
+#else
+    return false;
+#endif
+}
+
 } // namespace intel_cpu
 } // namespace ov
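A hedged usage sketch of the mapping helper added above: callers translate the plugin-level `Algorithm` into a oneDNN algorithm and must treat `undef` as "no direct oneDNN counterpart". The enums below are stand-ins, not the real plugin or oneDNN types:

```cpp
#include <cstdio>

// Stand-ins for ov::intel_cpu::Algorithm and dnnl::algorithm.
enum class Algorithm { EltwiseRelu, EltwiseSelect };
enum class dnnl_alg { undef, eltwise_relu };

static dnnl_alg convertToDnnlAlgorithm(Algorithm alg) {
    switch (alg) {
    case Algorithm::EltwiseRelu: return dnnl_alg::eltwise_relu;
    default:                     return dnnl_alg::undef;  // no 1:1 mapping
    }
}

int main() {
    // An undef result signals the caller to fall back to a non-oneDNN executor.
    if (convertToDnnlAlgorithm(Algorithm::EltwiseSelect) == dnnl_alg::undef)
        std::puts("fallback path");
    return 0;
}
```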
@@ -57,6 +57,8 @@ public:
     static bool hasProperImplementationType(dnnl::primitive_desc& desc, impl_desc_type implType);
     static dnnl_memory_desc_t clone_desc(const_dnnl_memory_desc_t cdesc);
     static const char* query_pd_info(const_dnnl_primitive_desc_t pd);
+    static dnnl::algorithm convertToDnnlAlgorithm(Algorithm alg);
+    static bool isUnarySupportedAsPostOp(Algorithm alg);
 };

 } // namespace intel_cpu
@@ -15,12 +15,13 @@
 #include "jit_dnnl_ext_emitters.hpp"
 #include "jit_conversion_emitters.hpp"

-#include "snippets_transformations/op/load_convert.hpp"
-#include "snippets_transformations/op/store_convert.hpp"
-#include "snippets_transformations/op/fused_mul_add.hpp"
-#include "snippets_transformations/op/brgemm_copy_b.hpp"
-#include "snippets_transformations/op/brgemm_cpu.hpp"
-#include "ngraph_transformations/op/swish_cpu.hpp"
+#include "transformations/snippets/x64/op/load_convert.hpp"
+#include "transformations/snippets/x64/op/store_convert.hpp"
+#include "transformations/snippets/x64/op/fused_mul_add.hpp"
+#include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
+#include "transformations/snippets/x64/op/brgemm_cpu.hpp"
+#include "snippets/op/brgemm.hpp"
+#include "transformations/cpu_opset/common/op/swish_cpu.hpp"

 #include <ngraph/opsets/opset5.hpp>
@@ -5,7 +5,7 @@
 #pragma once

 #include "ngraph/opsets/opset5.hpp"
-#include "ngraph_transformations/op/swish_cpu.hpp"
+#include "transformations/cpu_opset/common/op/swish_cpu.hpp"
 #include "jit_dnnl_emitters.hpp"

 namespace ov {
@@ -8,8 +8,8 @@
 #include "jit_snippets_emitters.hpp"
 #include "snippets/op/subgraph.hpp"
 #include "snippets/utils.hpp"
-#include "snippets_transformations/op/brgemm_copy_b.hpp"
-#include "snippets_transformations/op/brgemm_cpu.hpp"
+#include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
+#include "transformations/snippets/x64/op/brgemm_cpu.hpp"

 using namespace InferenceEngine;
 using ngraph::snippets::op::Subgraph;
@@ -10,7 +10,7 @@
 #include "jit_emitter.hpp"
 #include "jit_load_store_emitters.hpp"

-#include "snippets_transformations/op/store_convert.hpp"
+#include "transformations/snippets/x64/op/store_convert.hpp"
 // Matmul support:
 #include <cpu/x64/brgemm/brgemm.hpp>
 #include <cpu/x64/matmul/brgemm_matmul_copy_utils.hpp>
@@ -3,17 +3,17 @@
 //

 #include "extension.h"
-#include "ngraph_transformations/op/fully_connected.hpp"
-#include "ngraph_transformations/op/interaction.hpp"
-#include "ngraph_transformations/op/leaky_relu.hpp"
-#include "ngraph_transformations/op/power_static.hpp"
-#include "ngraph_transformations/op/swish_cpu.hpp"
-#include "ngraph_transformations/op/mha.hpp"
-#include "ngraph_transformations/op/ngram.hpp"
-#include "snippets_transformations/op/load_convert.hpp"
-#include "snippets_transformations/op/store_convert.hpp"
-#include "snippets_transformations/op/brgemm_cpu.hpp"
-#include "snippets_transformations/op/brgemm_copy_b.hpp"
+#include "transformations/cpu_opset/common/op/fully_connected.hpp"
+#include "transformations/cpu_opset/common/op/leaky_relu.hpp"
+#include "transformations/cpu_opset/common/op/power_static.hpp"
+#include "transformations/cpu_opset/common/op/swish_cpu.hpp"
+#include "transformations/cpu_opset/common/op/ngram.hpp"
+#include "transformations/cpu_opset/x64/op/mha.hpp"
+#include "transformations/cpu_opset/x64/op/interaction.hpp"
+#include "transformations/snippets/x64/op/load_convert.hpp"
+#include "transformations/snippets/x64/op/store_convert.hpp"
+#include "transformations/snippets/x64/op/brgemm_cpu.hpp"
+#include "transformations/snippets/x64/op/brgemm_copy_b.hpp"

 #include <ngraph/ngraph.hpp>
 #include <ov_ops/augru_cell.hpp>
@@ -46,20 +46,20 @@ std::map<std::string, ngraph::OpSet> Extension::getOpSets() {
     auto cpu_plugin_opset = []() {
         ngraph::OpSet opset;

+#if defined(OPENVINO_ARCH_X86_64)
+#define NGRAPH_OP_X64(NAME, NAMESPACE) NGRAPH_OP(NAME, NAMESPACE)
+#else
+#define NGRAPH_OP_X64(NAME, NAMESPACE)
+#endif
+
 #define NGRAPH_OP(NAME, NAMESPACE) opset.insert<NAMESPACE::NAME>();
-        NGRAPH_OP(InteractionNode, ov::intel_cpu)
         NGRAPH_OP(FullyConnectedNode, ov::intel_cpu)
         NGRAPH_OP(LeakyReluNode, ov::intel_cpu)
         NGRAPH_OP(PowerStaticNode, ov::intel_cpu)
         NGRAPH_OP(SwishNode, ov::intel_cpu)
-        NGRAPH_OP(MHANode, ov::intel_cpu)
         NGRAPH_OP(NgramNode, ov::intel_cpu)
-        NGRAPH_OP(LoadConvertSaturation, ov::intel_cpu)
-        NGRAPH_OP(LoadConvertTruncation, ov::intel_cpu)
-        NGRAPH_OP(StoreConvertSaturation, ov::intel_cpu)
-        NGRAPH_OP(StoreConvertTruncation, ov::intel_cpu)
-        NGRAPH_OP(BrgemmCPU, ov::intel_cpu)
-        NGRAPH_OP(BrgemmCopyB, ov::intel_cpu)
+        NGRAPH_OP_X64(MHANode, ov::intel_cpu)
+        NGRAPH_OP_X64(InteractionNode, ov::intel_cpu)
 #undef NGRAPH_OP

         return opset;
@@ -157,6 +157,12 @@ std::map<std::string, ngraph::OpSet> Extension::getOpSets() {
         NGRAPH_OP(Store, ngraph::snippets::op)
         NGRAPH_OP(Subgraph, ngraph::snippets::op)
         NGRAPH_OP(VectorBuffer, ngraph::snippets::op)
+        NGRAPH_OP_X64(LoadConvertSaturation, ov::intel_cpu)
+        NGRAPH_OP_X64(LoadConvertTruncation, ov::intel_cpu)
+        NGRAPH_OP_X64(StoreConvertSaturation, ov::intel_cpu)
+        NGRAPH_OP_X64(StoreConvertTruncation, ov::intel_cpu)
+        NGRAPH_OP_X64(BrgemmCPU, ov::intel_cpu)
+        NGRAPH_OP_X64(BrgemmCopyB, ov::intel_cpu)
 #undef NGRAPH_OP

         return opset;
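A self-contained sketch (hypothetical names, not the patch's macros) of the conditional-registration pattern used in the two hunks above: the `*_X64` variant expands to a real registration only when the x64 define is present, so non-x64 builds simply skip those ops.

```cpp
#include <set>
#include <string>

// Stand-in for the real opset container.
static std::set<std::string> registered_ops;

#define REGISTER_OP(NAME) registered_ops.insert(#NAME);

#if defined(OPENVINO_ARCH_X86_64)
#define REGISTER_OP_X64(NAME) REGISTER_OP(NAME)
#else
#define REGISTER_OP_X64(NAME)  // expands to nothing on non-x64 builds
#endif

int main() {
    REGISTER_OP(SwishNode)    // registered on every architecture
    REGISTER_OP_X64(MHANode)  // registered only on x64
    return static_cast<int>(registered_ops.size());
}
```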
@@ -988,21 +988,7 @@ void GraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(Graph &graph) {
             continue;
         }

-        if (!one_of(fuseCandidate->getAlgorithm(), Algorithm::EltwiseRelu,
-                    Algorithm::EltwiseGelu,
-                    Algorithm::EltwiseElu,
-                    Algorithm::EltwiseSigmoid,
-                    Algorithm::EltwiseClamp,
-                    Algorithm::EltwiseTanh,
-                    Algorithm::EltwiseSwish,
-                    Algorithm::EltwiseHswish,
-                    Algorithm::EltwiseMish,
-                    Algorithm::EltwiseHsigmoid,
-                    Algorithm::EltwiseRoundHalfToEven,
-                    Algorithm::EltwiseRoundHalfAwayFromZero,
-                    Algorithm::EltwiseAbs,
-                    Algorithm::EltwiseSqrt,
-                    Algorithm::EltwiseSoftRelu)) {
+        if (!DnnlExtensionUtils::isUnarySupportedAsPostOp(fuseCandidate->getAlgorithm())) {
             parent++;
             continue;
         }
@@ -1175,17 +1161,7 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) {

     auto isFusingSupported = [&](NodePtr conv, NodePtr child) {
         return child->getType() == Type::Eltwise &&
-            one_of(child->getAlgorithm(), Algorithm::EltwiseRelu,
-                   Algorithm::EltwiseElu,
-                   Algorithm::EltwiseSigmoid,
-                   Algorithm::EltwiseClamp,
-                   Algorithm::EltwiseSwish,
-                   Algorithm::EltwiseHswish,
-                   Algorithm::EltwiseMish,
-                   Algorithm::EltwiseHsigmoid,
-                   Algorithm::EltwiseRoundHalfToEven,
-                   Algorithm::EltwiseRoundHalfAwayFromZero,
-                   Algorithm::EltwiseSoftRelu);
+            DnnlExtensionUtils::isUnarySupportedAsPostOp(child->getAlgorithm());
     };

     for (auto &graphNode : graphNodes) {
@@ -1,56 +0,0 @@
-// Copyright (C) 2018-2023 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include <ngraph/pass/constant_folding.hpp>
-#include "fc_bias_fusion.hpp"
-#include "ngraph/op/fake_quantize.hpp"
-#include "ngraph/pass/manager.hpp"
-#include "reshape_fc_fusion.hpp"
-#include "align_matmul_input_ranks.hpp"
-#include "transformations/common_optimizations/reshape_prelu.hpp"
-#include "convert_broadcast_to_tiles.hpp"
-#include "convert_tile_to_seq_tiles.hpp"
-#include "convert_matmul_to_fc.hpp"
-#include "convert_to_power_static.hpp"
-#include "convert_to_leaky_relu.hpp"
-#include "convert_to_swish_cpu.hpp"
-#include "transformations/convert_precision.hpp"
-#include "transformations/utils/utils.hpp"
-#include "rnn_sequences_optimization.hpp"
-#include "transformations/common_optimizations/reshape_sequence_fusion.hpp"
-#include "ngram_fusion.hpp"
-
-#include "itt.hpp"
-
-namespace ov {
-namespace intel_cpu {
-
-inline void ConvertToCPUSpecificOpset(std::shared_ptr<ngraph::Function> &nGraphFunc) {
-    RUN_ON_FUNCTION_SCOPE(ConvertToCPUSpecificOpset);
-
-    ngraph::pass::Manager manager;
-    manager.set_per_pass_validation(false);
-    manager.register_pass<ConvertMatMulToFC>();
-    manager.register_pass<AlignMatMulInputRanks>();
-    manager.register_pass<ConvertTileToSeqTiles>();
-    manager.register_pass<FullyConnectedBiasFusion>();
-    manager.register_pass<ConvertToPowerStatic>();
-    manager.register_pass<ConvertToLeakyRelu>();
-    manager.register_pass<ConvertToSwishCPU>();
-    manager.register_pass<OptimizeSequenceTransposes>();
-    if (!ov::op::util::has_op_with_type<ngraph::op::FakeQuantize>(nGraphFunc)) {
-        manager.register_pass<ReshapeFullyConnectedFusion>();
-    }
-    // after transformation "MoveEltwiseUpThroughDataMov" there can be Reshape sequences that should be eliminated or fused
-    manager.register_pass<ov::pass::ReshapeSequenceFusion>();
-    manager.register_pass<ngraph::pass::ConstantFolding>();
-    manager.register_pass<ov::pass::ConvertPrecision>(precisions_map {{ ngraph::element::i64, ngraph::element::i32 }});
-    manager.register_pass<NgramFusion>();
-    manager.register_pass<ov::pass::Validate>();
-
-    manager.run_passes(nGraphFunc);
-}
-
-} // namespace intel_cpu
-} // namespace ov
@@ -455,6 +455,7 @@ std::string Node::getPrimitiveDescriptorType() {

     SEARCH_TYPE(winograd);
     SEARCH_TYPE(sparse);
+    SEARCH_TYPE(acl);
     SEARCH_TYPE(_dw);
     SEARCH_TYPE(_1x1);
@@ -959,6 +960,9 @@ void Node::cleanup() {
 const std::vector<impl_desc_type>& Node::getPrimitivesPriority() {
     std::vector<impl_desc_type> priorities = {
         impl_desc_type::unknown,
+        // The undef impl type expresses use-cases where the real type is unknown during compilation.
+        // Undef has higher priority than defined types in order to force primitive selection logic to make a decision based on other properties.
+        impl_desc_type::undef,
         impl_desc_type::brgconv_avx512_amx_1x1,
         impl_desc_type::brgconv_avx512_amx,
         impl_desc_type::jit_avx512_amx_dw,
@@ -988,6 +992,7 @@ const std::vector<impl_desc_type>& Node::getPrimitivesPriority() {
         impl_desc_type::gemm_avx2,
         impl_desc_type::gemm_avx,
         impl_desc_type::gemm_sse42,
+        impl_desc_type::acl,
         impl_desc_type::jit_gemm,
         impl_desc_type::ref_any,
         impl_desc_type::ref,
@@ -1340,6 +1345,7 @@ Node* Node::NodesFactory::create(const std::shared_ptr<ngraph::Node>& op, const GraphContext::CPtr context) {
 }

 bool Node::canBePerformedAsScaleShift(const Node *parentNode) const {
+#if defined(OPENVINO_ARCH_X86_64)
     IE_ASSERT(parentNode);

     size_t fusingPort = 0;
@@ -1390,6 +1396,10 @@ bool Node::canBePerformedAsScaleShift(const Node *parentNode) const {
                   Algorithm::EltwisePrelu,
                   Algorithm::EltwiseMulAdd) && isBroadcastableToDataInput())
             || isConvertablePowerStatic();
+#else
+    // TODO: provide correct list of operations for other backends
+    return false;
+#endif
 }

 // @todo shifts for Subtract and scales for Divide are replaced with
@@ -1606,22 +1616,7 @@ bool Node::canFuseSimpleOperation(const NodePtr& node) const {
         }
         return ret;
     } else if (node->getType() == Type::Eltwise) {
-        return one_of(node->getAlgorithm(),
-                      Algorithm::EltwiseRelu,
-                      Algorithm::EltwiseGelu,
-                      Algorithm::EltwiseElu,
-                      Algorithm::EltwiseSigmoid,
-                      Algorithm::EltwiseClamp,
-                      Algorithm::EltwiseTanh,
-                      Algorithm::EltwiseSwish,
-                      Algorithm::EltwiseHswish,
-                      Algorithm::EltwiseMish,
-                      Algorithm::EltwiseHsigmoid,
-                      Algorithm::EltwiseRoundHalfToEven,
-                      Algorithm::EltwiseRoundHalfAwayFromZero,
-                      Algorithm::EltwiseAbs,
-                      Algorithm::EltwiseSqrt,
-                      Algorithm::EltwiseSoftRelu) ||
+        return DnnlExtensionUtils::isUnarySupportedAsPostOp(node->getAlgorithm()) ||
                node->canBePerformedAsScaleShift(this);
     }
     return false;
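A minimal standalone sketch (toy types, not the plugin's) of the refactor running through the last three hunks: several fusion sites previously repeated the same algorithm whitelist, and routing them all through one predicate keeps the per-backend lists in a single place.

```cpp
enum class Algorithm { EltwiseRelu, EltwiseTanh, EltwiseGelu };

// Single source of truth, selected per backend at compile time.
static bool isUnarySupportedAsPostOp(Algorithm alg) {
#if defined(OV_CPU_WITH_ACL)
    // ACL supports a narrower unary set in this sketch.
    return alg == Algorithm::EltwiseRelu || alg == Algorithm::EltwiseTanh;
#else
    return alg == Algorithm::EltwiseRelu;
#endif
}

int main() {
    // Every fusion site asks the same predicate instead of
    // maintaining its own copy of the whitelist.
    return isUnarySupportedAsPostOp(Algorithm::EltwiseGelu) ? 1 : 0;
}
```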
@@ -37,6 +37,8 @@

 #include "dnnl_postops_composer.h"
 #include "graph_context.h"
+#include "nodes/executors/mvn_list.hpp"
+#include "nodes/executors/executor.hpp"

 namespace ov {
 namespace intel_cpu {
@@ -75,6 +77,12 @@ class NodeDesc {
 public:
     NodeDesc(const NodeConfig& conf, impl_desc_type type): config(conf) {
         implementationType = type;
+        executorFactory = nullptr;
+    }
+
+    NodeDesc(const NodeConfig& conf, impl_desc_type type, ExecutorFactoryPtr factory): config(conf) {
+        implementationType = type;
+        executorFactory = factory;
     }

     const NodeConfig& getConfig() const {
@@ -93,9 +101,28 @@ public:
         implementationType = type;
     }

+    ExecutorFactoryPtr getExecutorFactory() const {
+        return executorFactory;
+    }
+
+    template <typename T,
+              typename std::enable_if<!std::is_pointer<T>::value && !std::is_reference<T>::value, int>::type = 0,
+              typename std::enable_if<std::is_base_of<ExecutorFactory, T>::value, int>::type = 0>
+    std::shared_ptr<T> getExecutorFactoryAs() {
+        auto casted = std::dynamic_pointer_cast<T>(executorFactory);
+        if (!casted)
+            IE_THROW() << "Cannot dynamically cast ExecutorFactory";
+        return casted;
+    }
+
+    void setExecutorFactory(ExecutorFactoryPtr factory) {
+        executorFactory = factory;
+    }
+
 private:
     NodeConfig config;
     impl_desc_type implementationType;
+    ExecutorFactoryPtr executorFactory;
 };

 class Node {
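A hedged usage sketch (standalone, with stand-in types) of the `getExecutorFactoryAs<T>()` accessor added above: the node descriptor stores a type-erased factory pointer and callers downcast to the concrete factory they expect.

```cpp
#include <memory>
#include <stdexcept>

struct ExecutorFactory { virtual ~ExecutorFactory() = default; };
struct EltwiseExecutorFactory : ExecutorFactory { /* ... */ };

struct NodeDesc {
    std::shared_ptr<ExecutorFactory> executorFactory;

    template <typename T>
    std::shared_ptr<T> getExecutorFactoryAs() {
        auto casted = std::dynamic_pointer_cast<T>(executorFactory);
        if (!casted)
            throw std::runtime_error("Cannot dynamically cast ExecutorFactory");
        return casted;
    }
};

int main() {
    NodeDesc desc;
    desc.executorFactory = std::make_shared<EltwiseExecutorFactory>();
    // Retrieve the concrete factory; throws if the stored type differs.
    auto factory = desc.getExecutorFactoryAs<EltwiseExecutorFactory>();
    return factory ? 0 : 1;
}
```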
@@ -42,7 +42,7 @@ using namespace Xbyak;

 namespace ov {
 namespace intel_cpu {
 namespace node {

+#if defined(OPENVINO_ARCH_X86_64)
 #define GET_OFF(field) offsetof(jit_bin_conv_call_args, field)

 template <cpu_isa_t isa>
@@ -874,7 +874,7 @@ private:
         }
     }
 };
+#endif

 bool BinaryConvolution::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
     try {
         if (isDynamicNgraphNode(op)) {
@@ -1092,7 +1092,7 @@ void BinaryConvolution::createPrimitive() {
         IMPLICATION(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0) || (jcp.stride_w == 1 && jcp.stride_h == 1));
     if (!args_ok)
         IE_THROW() << "BinaryConvolution with name '" << getName() << "' has unsupported parameters";

+#if defined(OPENVINO_ARCH_X86_64)
     if (implType == impl_desc_type::jit_avx512) {
         bin_conv_kernel.reset(new jit_uni_bin_conv_kernel_f32<x64::avx512_core>(jcp, jcp_dw_conv, *attr.get()));
     } else if (implType == impl_desc_type::jit_avx2) {
@@ -1102,6 +1102,7 @@ void BinaryConvolution::createPrimitive() {
     }
     if (bin_conv_kernel)
         bin_conv_kernel->create_ker();
+#endif
 }

 bool BinaryConvolution::canFuse(const NodePtr& node) const {
@@ -10,7 +10,7 @@
 #include <openvino/op/i420_to_bgr.hpp>
 #include <openvino/core/type.hpp>
 #include <ie/ie_parallel.hpp>
-#include <utils/jit_kernel.hpp>
+#include "kernels/x64/jit_kernel.hpp"

 using namespace InferenceEngine;
 using namespace dnnl::impl;
@@ -76,6 +76,7 @@ std::tuple<T, T, T> Converter::yuv_to_rgb(float y, float u, float v) {
     return std::make_tuple(r, g, b);
 }

+#if defined(OPENVINO_ARCH_X86_64)
 struct jit_uni_converter : public jit_kernel {
     DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_converter)

@@ -264,6 +265,7 @@ void jit_uni_converter::store_tail(const variable<T*> & dst,

     copy<T>(ptr[dst], s.pointer(), copy_size);
 }
+#endif

 namespace nv12 {

@@ -394,6 +396,7 @@ public:
     }
 };

+#if defined(OPENVINO_ARCH_X86_64)
 template<typename T>
 class JitConverter;

@@ -611,7 +614,7 @@ public:
         });
     }
 };
+#endif
 }   // namespace nv12

 namespace i420 {
@@ -748,6 +751,7 @@ public:
     }
 };

+#if defined(OPENVINO_ARCH_X86_64)
 template<typename T>
 class JitConverter;

@@ -964,13 +968,13 @@ public:
         });
     }
 };
+#endif
 }   // namespace i420

 /**
  * Implements the ColorConvert shape inference algorithm. Depending on whether there is only a single plane,
  * the H dimension is either passed through or recalculated as 2/3 of the initial size.
  */
 class ColorConvertShapeInfer : public ShapeInferEmptyPads {
 public:
@@ -1098,6 +1102,7 @@ void ColorConvert::initSupportedNV12Impls() {
         impls[Precision::FP32][false] = SUPPORTED_IMPL(TwoPlaneConvert, float, ref);
     }

+#if defined(OPENVINO_ARCH_X86_64)
     // jit_uni
     {
         auto &impls = _supportedImpls[impl_desc_type::jit_uni][algorithm];
@@ -1106,7 +1111,7 @@ void ColorConvert::initSupportedNV12Impls() {
         impls[Precision::FP32][true] = SUPPORTED_IMPL(SinglePlaneConvert, float, jit_uni);
         impls[Precision::FP32][false] = SUPPORTED_IMPL(TwoPlaneConvert, float, jit_uni);
     }
+#endif
 #undef SUPPORTED_IMPL
 }

@@ -1125,6 +1130,7 @@ void ColorConvert::initSupportedI420Impls() {
         impls[Precision::FP32][false] = SUPPORTED_IMPL(ThreePlaneConvert, float, ref);
     }

+#if defined(OPENVINO_ARCH_X86_64)
     // jit_uni
     {
         auto &impls = _supportedImpls[impl_desc_type::jit_uni][algorithm];
@@ -1133,7 +1139,7 @@ void ColorConvert::initSupportedI420Impls() {
         impls[Precision::FP32][true] = SUPPORTED_IMPL(SinglePlaneConvert, float, jit_uni);
         impls[Precision::FP32][false] = SUPPORTED_IMPL(ThreePlaneConvert, float, jit_uni);
     }
+#endif
 #undef SUPPORTED_IMPL
 }
@@ -7,25 +7,31 @@
 #include <ie_parallel.hpp>
 #include <utils/bfloat16.hpp>
 #include <utils/general_utils.h>
-#include <utils/jit_kernel.hpp>
 #include <selective_build.h>
 #include <openvino/core/type/float16.hpp>
-#include <cpu/x64/jit_generator.hpp>
 #include <algorithm>
 #include <type_traits>
 #include <tuple>
 #include <cmath>
 #include <onednn/dnnl.h>
+#if defined(OPENVINO_ARCH_X86_64)
+#include "nodes/kernels/x64/jit_kernel.hpp"
+#include <cpu/x64/jit_generator.hpp>
+#endif

 using namespace InferenceEngine;
-using namespace dnnl::impl::utils;
-using namespace dnnl::impl::cpu::x64;
-using namespace Xbyak;

 namespace ov {
 namespace intel_cpu {
 namespace {

+#if defined(OPENVINO_ARCH_X86_64)
+
+using namespace dnnl::impl::utils;
+using namespace dnnl::impl::cpu::x64;
+using namespace Xbyak;
+
 template <typename src_t, typename dst_t>
 void convert_vec(jit_generator & gen,
                  const RegExp & src,
@@ -156,6 +162,8 @@ void jit_convert(const TI* arg, TO* out, size_t count) {
     }
 }

+#endif
+
 template <Precision::ePrecision p>
 struct PrecisionInfo {
     using value_type = typename PrecisionTrait<p>::value_type;
@@ -356,6 +364,7 @@ struct ConvertPrecision<std::tuple<ov::intel_cpu::bfloat16_t, float>> {
     }
 };

+#if defined(OPENVINO_ARCH_X86_64)
 template<typename src_t>
 struct ConvertPrecision<std::tuple<src_t, ov::float16>> {
     void operator()(ConvertContext & ctx) {
@@ -462,6 +471,7 @@ struct ConvertPrecision<std::tuple<ov::float16, ov::float16>> {
         ctx.converted = true;
     }
 };
+#endif

 } // namespace
@@ -26,6 +26,8 @@ using namespace Xbyak;
 namespace ov {
 namespace intel_cpu {

+#if defined(OPENVINO_ARCH_X86_64)
+
 template <cpu_isa_t isa>
 struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_generator {
     DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_permute_kernel_f32)
@@ -141,6 +143,8 @@ private:
     Xbyak::Xmm xmm = Xbyak::Xmm(1);
 };

+#endif // OPENVINO_ARCH_X86_64
+
 PermuteKernel::PermuteKernel(const PermuteParams& params) : params(params) {
     prepareParams();
 }
@@ -257,6 +261,7 @@ void PermuteKernel::prepareParams() {
     jcp.ndims = sorted_order.size();
     jcp.data_size = params.data_size;

+#if defined(OPENVINO_ARCH_X86_64)
     if (mayiuse(cpu::x64::avx512_core)) {
         permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::x64::avx512_core>(jcp));
     } else if (mayiuse(cpu::x64::avx2)) {
@@ -264,6 +269,7 @@ void PermuteKernel::prepareParams() {
     } else if (mayiuse(cpu::x64::sse41)) {
         permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::x64::sse41>(jcp));
     }
+#endif // OPENVINO_ARCH_X86_64

     if (permute_kernel)
         permute_kernel->create_ker();
@@ -9,7 +9,7 @@
 #include <cpu/x64/injectors/jit_uni_eltwise_injector.hpp>
 #include <onednn/dnnl.h>
 #include "utils/bfloat16.hpp"
-#include "emitters/jit_bf16_emitters.hpp"
+#include "emitters/x64/jit_bf16_emitters.hpp"

 #include <algorithm>
 #include <cassert>
@@ -50,7 +50,7 @@ struct jit_uni_softmax_kernel {

     virtual void create_ker() = 0;
 };

+#if defined(OPENVINO_ARCH_X86_64)
 template <cpu_isa_t isa>
 struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_generator {
     DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_softmax_kernel_f32)
@@ -226,7 +226,7 @@ private:
         }
     }
 };

+#endif
 SoftmaxGeneric::SoftmaxGeneric(Precision inpPrc, Precision outPrc)
     : input_prec(inpPrc), output_prec(outPrc) {
     if (Precision::BF16 == output_prec) {
@@ -236,6 +236,7 @@ SoftmaxGeneric::SoftmaxGeneric(Precision inpPrc, Precision outPrc)
     }

     block_size = 1;
+#if defined(OPENVINO_ARCH_X86_64)
     auto jcp = jit_softmax_config_params();
     jcp.src_dt = inpPrc;
     jcp.dst_dt = outPrc;
@@ -252,12 +253,14 @@ SoftmaxGeneric::SoftmaxGeneric(Precision inpPrc, Precision outPrc)
     }
     if (softmax_kernel)
         softmax_kernel->create_ker();
+#endif
 }

 template<typename in_data_t, typename out_data_t>
 void SoftmaxGeneric::calculate(const in_data_t *src_data, out_data_t *dst_data, int B, int C, int H, int W) {
     for (int b = 0; b < B; b++) {
         int tail_start = 0;

         if (softmax_kernel) {
             int blocks_num = H*W / block_size;
@@ -327,6 +327,9 @@ InferenceEngine::Precision Convolution::fusedEltwisePrecision(const NodePtr& fusingNode) const {
 const std::vector<impl_desc_type>& Convolution::getPrimitivesPriority() {
     std::vector<impl_desc_type> priorities = {
         impl_desc_type::unknown,
+        impl_desc_type::dw_acl,
+        impl_desc_type::winograd_acl,
+        impl_desc_type::gemm_acl,
         impl_desc_type::brgconv_avx512_amx_1x1,
         impl_desc_type::brgconv_avx512_amx,
         impl_desc_type::jit_avx512_amx_dw,
@@ -556,6 +559,7 @@ void Convolution::getSupportedDescriptors() {
     auto inputShape = getInputShapeAtPort(0);
     auto outputShape = getOutputShapeAtPort(0);

+#if defined(OPENVINO_ARCH_X86_64)
     bool acceptedFormat = inputDataType == memory::data_type::bf16;
     bool nspcAdded = false;
     acceptedFormat |= (shouldTryBrgconv && inputDataType == memory::data_type::f32);
@@ -594,6 +598,15 @@ void Convolution::getSupportedDescriptors() {
         out_candidate = std::make_shared<DnnlBlockedMemoryDesc>(outputShape, outputDataType, nspc);
         createDescriptor({ in_candidate }, { out_candidate });
     }
+#else
+    (void)ncsp;
+    (void)nCsp8c;
+    (void)nCsp16c;
+
+    in_candidate = std::make_shared<DnnlBlockedMemoryDesc>(inputShape, inputDataType, nspc);
+    out_candidate = std::make_shared<DnnlBlockedMemoryDesc>(outputShape, outputDataType, nspc);
+    createDescriptor({ in_candidate }, { out_candidate });
+#endif
 }

 void Convolution::setPostOps(dnnl::primitive_attr& attr,
@@ -899,7 +912,7 @@ void Convolution::createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,

     if (isWinograd())
         algorithms.push_back(dnnl::algorithm::convolution_winograd);
-    algorithms.push_back(dnnl::algorithm::convolution_direct);
+    algorithms.push_back(baseConvAlgorithm);

     updatePadding();

@@ -1367,7 +1380,8 @@ void Convolution::prepareParams() {
                              getParentEdgeAt(1)->getParent()->isConstant()};

     auto engine = getEngine();
-    auto builder = [&engine](const ConvKey& key) -> executorPtr {
+    auto convAlg = baseConvAlgorithm;
+    auto builder = [&engine, convAlg](const ConvKey& key) -> executorPtr {
         // remove the requirement on weight memory layout to let primitive
         // report the best layout for weight to be reordered dynamically at runtime
         auto wghDescAny =
@@ -1405,7 +1419,7 @@ void Convolution::prepareParams() {
                                     attr);
         };

-        const auto alg = (key.implType & impl_desc_type::winograd) ? dnnl::algorithm::convolution_winograd : dnnl::algorithm::convolution_direct;
+        const auto alg = (key.implType & impl_desc_type::winograd) ? dnnl::algorithm::convolution_winograd : convAlg;
         dnnl::primitive_desc desc = createDnnlConvDesc(engine,
                                                        key.inp0->getDnnlDesc(),
                                                        wghDescAny,
@@ -1419,6 +1433,7 @@ void Convolution::prepareParams() {
                                                        key.attr);

         auto itpd = desc;

         executorPtr execPtr = nullptr;
         while (static_cast<bool>(itpd)) {
             impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
@@ -1456,7 +1471,7 @@ void Convolution::prepareParams() {
                                                             key.dilation,
                                                             key.paddingL,
                                                             key.paddingR,
-                                                            dnnl::algorithm::convolution_direct,
+                                                            convAlg,
                                                             key.attr);

         if (reorderConvDesc) {
@@ -171,6 +171,13 @@ private:
     MemoryPtr stockInputZeroPointsMemPtr;
     dnnl::memory::data_type outputDataType;
     InferenceEngine::Precision sumPrc = InferenceEngine::Precision::UNSPECIFIED;

+    // TODO: migrate to the convolution_auto algorithm on x64 as well
+#if defined(OPENVINO_ARCH_X86_64)
+    const dnnl::algorithm baseConvAlgorithm = dnnl::algorithm::convolution_direct;
+#else
+    const dnnl::algorithm baseConvAlgorithm = dnnl::algorithm::convolution_auto;
+#endif
 };

 } // namespace node
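A tiny standalone illustration of the per-architecture default introduced above: `convolution_auto` lets oneDNN pick the implementation (which on AArch64 allows dispatching into ACL-backed kernels), while x64 keeps the explicit `convolution_direct` default for now. The enum here is a stand-in, not the oneDNN type.

```cpp
#include <cstdio>

// Stand-in for dnnl::algorithm.
enum class algorithm { convolution_auto, convolution_direct };

// Mirror of the header change: the base algorithm is fixed per target.
#if defined(OPENVINO_ARCH_X86_64)
constexpr algorithm baseConvAlgorithm = algorithm::convolution_direct;
#else
constexpr algorithm baseConvAlgorithm = algorithm::convolution_auto;
#endif

int main() {
    std::printf("base conv algorithm: %s\n",
                baseConvAlgorithm == algorithm::convolution_auto ? "auto" : "direct");
    return 0;
}
```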
@@ -27,7 +27,7 @@ using namespace Xbyak;

 namespace ov {
 namespace intel_cpu {
 namespace node {

+#if defined(OPENVINO_ARCH_X86_64)
 #define GET_OFF(field) offsetof(jit_def_conv_call_args, field)

 template <cpu_isa_t isa>
@@ -671,7 +671,7 @@ private:
         pop(reg_sampled_offs);
     }
 };
+#endif

 bool DeformableConvolution::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
     try {
         if (!one_of(op->get_type_info(),
@@ -1033,7 +1033,7 @@ DeformableConvolution::DefConvExecutor::DefConvExecutor(const DefConvAttr &defConvAttr,
     if (withModulation) {
         modStrides = descVector[MOD_ID]->getStrides();
     }

+#if defined(OPENVINO_ARCH_X86_64)
     const VectorDims srcDims = descVector[DATA_ID]->getShape().getStaticDims();
     const VectorDims weiDims = descVector[WEI_ID]->getShape().getStaticDims();
     const VectorDims dstDims = descVector[descVector.size() - 1]->getShape().getStaticDims();
@@ -1084,11 +1084,13 @@ DeformableConvolution::DefConvExecutor::DefConvExecutor(const DefConvAttr &defConvAttr,
     jcp.nb_oc_blocking = !mayiuse(cpu::x64::avx2) ? 2 : 4;

     jcp.nthr = dnnl_get_max_threads();
+#endif
 }

 DeformableConvolution::DefConvJitExecutor::DefConvJitExecutor(const DefConvAttr &defConvAttr,
                             const std::vector<std::shared_ptr<BlockedMemoryDesc>> &descVector) :
     DefConvExecutor(defConvAttr, descVector) {
+#if defined(OPENVINO_ARCH_X86_64)
     if (mayiuse(cpu::x64::avx512_core)) {
         def_conv_kernel.reset(new jit_uni_def_conv_kernel_f32<cpu::x64::avx512_core>(jcp));
     } else if (mayiuse(cpu::x64::avx2)) {
@@ -1103,6 +1105,7 @@ DeformableConvolution::DefConvJitExecutor::DefConvJitExecutor(const DefConvAttr &defConvAttr,
     } else {
         IE_THROW() << "Can't compile DefConvJitExecutor";
     }
+#endif
 }

 void DeformableConvolution::DefConvRefExecutor::exec(const float* src, const float* offsets,
@@ -123,7 +123,7 @@ inline float getImaginaryFromComplexProd(float lhsReal, float lhsImag, float rhsReal, float rhsImag) {

 /*
     Returns true while we can iterate
     Specified axis is skipped in counters
 */
 inline bool nextIterationStep(std::vector<size_t>& counters, const std::vector<size_t>& iterationRange, size_t axis) {
     auto itCounter = counters.rbegin();
@@ -535,7 +535,6 @@ void DFT::prepareParams() {
             hasFFT = true;
         }
     }

     if (mayiuse(cpu::x64::sse41)) {
         createJITKernels(hasDFT, hasFFT);
     }
@@ -553,8 +552,8 @@ std::vector<int32_t> DFT::getAxes() const {
     std::sort(axes.begin(), axes.end());
     return axes;
 }

 void DFT::createJITKernels(bool hasDFT, bool hasFFT) {
+#if defined(OPENVINO_ARCH_X86_64)
     if (hasDFT && dftKernel == nullptr) {
         if (mayiuse(cpu::x64::avx512_core)) {
             dftKernel.reset(new jit_uni_dft_kernel_f32<cpu::x64::avx512_core>());
@@ -584,8 +583,8 @@ void DFT::createJITKernels(bool hasDFT, bool hasFFT) {
     if (fftKernel)
         fftKernel->create_ker();
+#endif
 }

 } // namespace node
 } // namespace intel_cpu
 } // namespace ov
@@ -8,7 +8,7 @@
 #include <node.h>
 #include <string>

-#include "kernels/dft_uni_kernel.hpp"
+#include "kernels/x64/dft_uni_kernel.hpp"

 namespace ov {
 namespace intel_cpu {
@@ -31,7 +31,6 @@ public:
 private:
     std::vector<int32_t> getAxes() const;
     void createJITKernels(bool hasDFT, bool hasFFT);

     void dftNd(float* output,
                const VectorDims& outputShape,
                const VectorDims& outputStrides,
@@ -23,10 +23,10 @@
 #include "input.h"
 #include "common/cpu_convert.h"

-#include "emitters/jit_emitter.hpp"
-#include "emitters/jit_eltwise_emitters.hpp"
-#include "emitters/jit_dnnl_emitters.hpp"
-#include "emitters/jit_bf16_emitters.hpp"
+#include "emitters/x64/jit_emitter.hpp"
+#include "emitters/x64/jit_eltwise_emitters.hpp"
+#include "emitters/x64/jit_dnnl_emitters.hpp"
+#include "emitters/x64/jit_bf16_emitters.hpp"
 #include <selective_build.h>
 #include "utils/general_utils.h"
 #include "utils/cpu_utils.hpp"
@@ -34,9 +34,9 @@

 #include "ngraph/ngraph.hpp"
 #include <ngraph/opsets/opset1.hpp>
-#include "ngraph_transformations/op/power_static.hpp"
-#include "ngraph_transformations/op/leaky_relu.hpp"
-#include "ngraph_transformations/op/swish_cpu.hpp"
+#include "transformations/cpu_opset/common/op/power_static.hpp"
+#include "transformations/cpu_opset/common/op/leaky_relu.hpp"
+#include "transformations/cpu_opset/common/op/swish_cpu.hpp"

 #include <string>
 #include <vector>
@@ -58,7 +58,8 @@ using namespace Xbyak;

 namespace ov {
 namespace intel_cpu {
 namespace node {
 namespace {

+#if defined(OPENVINO_ARCH_X86_64)
+
 template<typename T>
 struct SupportedPrecisions {
@@ -106,61 +107,7 @@ struct EltwiseEmitter<jit_is_inf_emitter> {
     }
 };

-/**
- * Implements Eltwise shape inference algorithm. The algorithm is based on broadcasting all the input shapes
- * according to the NUMPY broadcast rule. This implementation is more lightweight than the ngraph one.
- *
- */
-class EltwiseShapeInfer : public ShapeInferEmptyPads {
-public:
-    Result infer(
-        const std::vector<std::reference_wrapper<const VectorDims>>& input_shapes,
-        const std::unordered_map<size_t, MemoryPtr>& data_dependency) override {
-        size_t max_rank = 0;
-        size_t max_rank_idx = 0;
-        for (size_t i = 0; i < input_shapes.size(); ++i) {
-            auto item_rank = input_shapes[i].get().size();
-            if (item_rank > max_rank) {
-                max_rank = item_rank;
-                max_rank_idx = i;
-            }
-        }
-        auto output_shape = input_shapes[max_rank_idx].get();
-        // use NUMPY broadcast rule
-        for (size_t i = 0; i < input_shapes.size(); i++) {
-            if (i == max_rank_idx)
-                continue;
-
-            auto& input_shape = input_shapes[i].get();
-            if (input_shape.size() > output_shape.size()) {
-                IE_THROW() << "Eltwise shape infer input and output shapes rank mismatch";
-            }
-            size_t offset = output_shape.size() - input_shape.size();
-            for (size_t j = 0; j < input_shape.size(); ++j) {
-                if (input_shape[j] != output_shape[offset + j]) {
-                    if (output_shape[offset + j] == 1) {
-                        output_shape[offset + j] = input_shape[j];
-                    } else {
-                        if (input_shape[j] != 1) IE_THROW() << "Eltwise shape infer input shapes dim index: " << j << " mismatch";
-                    }
-                }
-            }
-        }
-        return { { std::move(output_shape) }, ShapeInferStatus::success };
-    }
-    port_mask_t get_port_mask() const override {
-        return EMPTY_PORT_MASK;
-    }
-};
-
-class EltwiseShapeInferFactory : public ShapeInferFactory {
-public:
-    ShapeInferPtr makeShapeInfer() const override {
-        return std::make_shared<EltwiseShapeInfer>();
-    }
-};
-
-void set_intersection(const std::set<std::vector<element::Type>>& precisions1,
+static void set_intersection(const std::set<std::vector<element::Type>>& precisions1,
                       const std::set<std::vector<element::Type>>& precisions2,
                       std::set<std::vector<element::Type>>& intersection) {
     std::map<element::Type, size_t> intersection_types;
@@ -181,9 +128,6 @@ void set_intersection(const std::set<std::vector<element::Type>>& precisions1,
     }
 }

-} // namespace
-
-
 InferenceEngine::Precision eltwise_precision_helper::get_precision(const size_t inputs_number,
                                                                    const InferenceEngine::Precision(&src_prc)[MAX_ELTWISE_INPUTS],
                                                                    const std::vector<Eltwise::EltwiseData>& eltwise_data) {
@@ -261,7 +205,8 @@ std::set<std::vector<element::Type>> eltwise_precision_helper::get_supported_precisions(const Algorithm& algo) {

     OV_SWITCH(intel_cpu, SupportedPrecisions, precisions, algo,
         OV_CASE(Algorithm::EltwiseRelu, jit_dnnl_aux_emitter),
-        OV_CASE(Algorithm::EltwiseGelu, jit_dnnl_aux_emitter),
+        OV_CASE(Algorithm::EltwiseGeluErf, jit_dnnl_aux_emitter),
+        OV_CASE(Algorithm::EltwiseGeluTanh, jit_dnnl_aux_emitter),
         OV_CASE(Algorithm::EltwiseElu, jit_dnnl_aux_emitter),
         OV_CASE(Algorithm::EltwiseTanh, jit_dnnl_aux_emitter),
         OV_CASE(Algorithm::EltwiseSigmoid, jit_dnnl_aux_emitter),
@@ -633,7 +578,8 @@ private:

     OV_SWITCH(intel_cpu, EltwiseEmitter, ctx, data.algo,
         OV_CASE(Algorithm::EltwiseRelu, jit_dnnl_aux_emitter),
-        OV_CASE(Algorithm::EltwiseGelu, jit_dnnl_aux_emitter),
+        OV_CASE(Algorithm::EltwiseGeluErf, jit_dnnl_aux_emitter),
+        OV_CASE(Algorithm::EltwiseGeluTanh, jit_dnnl_aux_emitter),
         OV_CASE(Algorithm::EltwiseElu, jit_dnnl_aux_emitter),
         OV_CASE(Algorithm::EltwiseTanh, jit_dnnl_aux_emitter),
         OV_CASE(Algorithm::EltwiseSigmoid, jit_dnnl_aux_emitter),
@@ -972,6 +918,66 @@ private:
     }
 };

+#endif // OPENVINO_ARCH_X86_64
+
+namespace {
+
+/**
+ * Implements Eltwise shape inference algorithm. The algorithm is based on broadcasting all the input shapes
+ * according to the NUMPY broadcast rule. This implementation is more lightweight than the ngraph one.
+ *
+ */
+class EltwiseShapeInfer : public ShapeInferEmptyPads {
+public:
+    Result infer(
+        const std::vector<std::reference_wrapper<const VectorDims>>& input_shapes,
+        const std::unordered_map<size_t, MemoryPtr>& data_dependency) override {
+        size_t max_rank = 0;
+        size_t max_rank_idx = 0;
+        for (size_t i = 0; i < input_shapes.size(); ++i) {
+            auto item_rank = input_shapes[i].get().size();
+            if (item_rank > max_rank) {
+                max_rank = item_rank;
+                max_rank_idx = i;
+            }
+        }
+        auto output_shape = input_shapes[max_rank_idx].get();
+        // use NUMPY broadcast rule
+        for (size_t i = 0; i < input_shapes.size(); i++) {
+            if (i == max_rank_idx)
+                continue;
+
+            auto& input_shape = input_shapes[i].get();
+            if (input_shape.size() > output_shape.size()) {
+                IE_THROW() << "Eltwise shape infer input and output shapes rank mismatch";
+            }
+            size_t offset = output_shape.size() - input_shape.size();
+            for (size_t j = 0; j < input_shape.size(); ++j) {
+                if (input_shape[j] != output_shape[offset + j]) {
+                    if (output_shape[offset + j] == 1) {
+                        output_shape[offset + j] = input_shape[j];
+                    } else {
+                        if (input_shape[j] != 1) IE_THROW() << "Eltwise shape infer input shapes dim index: " << j << " mismatch";
+                    }
+                }
+            }
+        }
+        return { { std::move(output_shape) }, ShapeInferStatus::success };
+    }
+    port_mask_t get_port_mask() const override {
+        return EMPTY_PORT_MASK;
+    }
+};
+
+class EltwiseShapeInferFactory : public ShapeInferFactory {
+public:
+    ShapeInferPtr makeShapeInfer() const override {
+        return std::make_shared<EltwiseShapeInfer>();
+    }
+};
+
+} // namespace
+
 Eltwise::BroadcastingPolicy Eltwise::determineBroadcastingPolicy(const std::shared_ptr<ngraph::Node>& op) {
     const auto const1 = ov::as_type_ptr<ngraph::opset1::Constant>(op->get_input_node_shared_ptr(0));
     const auto const2 = ov::as_type_ptr<ngraph::opset1::Constant>(op->get_input_node_shared_ptr(1));
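Since the class moved above implements NUMPY-style broadcasting, here is a compact standalone sketch of the same rule for two shapes (align ranks from the right; a dimension of 1 stretches to match). It is an illustration of the algorithm, not the plugin's code:

```cpp
#include <cstddef>
#include <stdexcept>
#include <vector>

using VectorDims = std::vector<std::size_t>;

// NUMPY broadcast of two shapes, aligned from the trailing dimension.
VectorDims broadcast(const VectorDims& a, const VectorDims& b) {
    const VectorDims& lo = a.size() < b.size() ? a : b;
    VectorDims out = a.size() < b.size() ? b : a;
    std::size_t offset = out.size() - lo.size();
    for (std::size_t j = 0; j < lo.size(); ++j) {
        if (lo[j] == out[offset + j]) continue;
        if (out[offset + j] == 1)
            out[offset + j] = lo[j];       // the 1 stretches to the other dim
        else if (lo[j] != 1)
            throw std::runtime_error("shapes are not broadcastable");
    }
    return out;
}

int main() {
    // {8, 1, 6, 1} with {7, 1, 5} broadcasts to {8, 7, 6, 5}.
    VectorDims r = broadcast({8, 1, 6, 1}, {7, 1, 5});
    return r == VectorDims{8, 7, 6, 5} ? 0 : 1;
}
```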
@@ -1088,23 +1094,24 @@ const std::map<const ngraph::DiscreteTypeInfo, Eltwise::Initializer> Eltwise::initializers = {
         node.beta = 0.0f;
     }},
     {ngraph::op::v0::Gelu::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, Eltwise& node) {
-        node.algorithm = Algorithm::EltwiseGelu;
+        node.algorithm = Algorithm::EltwiseGeluErf;
         node.onednnAlgorithm = dnnl::algorithm::eltwise_gelu_erf;
     }},
     {ngraph::op::v7::Gelu::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, Eltwise& node) {
         auto gelu = getNgraphOpAs<ngraph::op::v7::Gelu>(op);
-        node.algorithm = Algorithm::EltwiseGelu;
         ngraph::op::GeluApproximationMode approximationMode = gelu->get_approximation_mode();
-        if (approximationMode == ngraph::op::GeluApproximationMode::ERF)
+        if (approximationMode == ngraph::op::GeluApproximationMode::ERF) {
+            node.algorithm = Algorithm::EltwiseGeluErf;
             node.onednnAlgorithm = dnnl::algorithm::eltwise_gelu_erf;
-        else if (approximationMode == ngraph::op::GeluApproximationMode::TANH)
+        } else if (approximationMode == ngraph::op::GeluApproximationMode::TANH) {
+            node.algorithm = Algorithm::EltwiseGeluTanh;
             node.onednnAlgorithm = dnnl::algorithm::eltwise_gelu_tanh;
-        else
+        } else {
             IE_THROW(NotImplemented) << "CPU Eltwise node doesn't support ngraph operation Gelu with approximation mode: " << approximationMode;
+        }
     }},
     {ngraph::op::v0::Elu::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, Eltwise& node) {
         auto eluOp = getNgraphOpAs<ngraph::op::v0::Elu>(op);

         node.alpha = static_cast<float>(eluOp->get_alpha());
         node.algorithm = Algorithm::EltwiseElu;
         node.onednnAlgorithm = dnnl::algorithm::eltwise_elu;
@@ -1197,6 +1204,9 @@ const std::map<const ngraph::DiscreteTypeInfo, Eltwise::Initializer> Eltwise::initializers = {
     {ngraph::op::v1::Select::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, Eltwise& node) {
         node.algorithm = Algorithm::EltwiseSelect;
     }},
+    {ngraph::op::v0::Log::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, Eltwise& node) {
+        node.algorithm = Algorithm::EltwiseLog;
+    }},
 };
@@ -1503,6 +1513,7 @@ public:
         std::transform(jep.oc_offsets.begin(), jep.oc_offsets.end(), jep.oc_offsets.begin(),
                        [](size_t& offset) { return offset * sizeof(float);});

+#if defined(OPENVINO_ARCH_X86_64)
        if (mayiuse(x64::avx512_core)) {
             _pKernel.reset(new jit_uni_eltwise_generic<x64::avx512_core>(jep, eltwise_data, ops_list, post_ops));
         } else if (mayiuse(x64::avx2)) {
@@ -1512,7 +1523,7 @@ public:
         } else {
             IE_THROW() << "Can't create jit eltwise kernel";
         }
+#endif // OPENVINO_ARCH_X86_64
         if (_pKernel)
             _pKernel->create_ker();
     }
@@ -1629,6 +1640,15 @@ public:
     }

     void exec(const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) override {
+        if (_opData.algo == Algorithm::EltwiseLog) {
+            const float* src_ptr_f = reinterpret_cast<const float*>(args_ptrs.src_ptr[0]);
+            float* dst_ptr_f = reinterpret_cast<float*>(args_ptrs.dst_ptr);
+            parallel_for(_fullWorkAmount, [&](size_t i) {
+                dst_ptr_f[i] = logf(src_ptr_f[i]);
+            });
+            return;
+        }
+
         std::shared_ptr<ref_eltwise_scalar_fwd_t> ref_eltwise_injector = nullptr;
         if (_opData.onednnAlgorithm != dnnl::algorithm::undef) {
             ref_eltwise_injector = std::make_shared<ref_eltwise_scalar_fwd_t>(
@@ -1671,7 +1691,8 @@ public:

         switch (_opData.algo) {
             case Algorithm::EltwiseRelu:
-            case Algorithm::EltwiseGelu:
+            case Algorithm::EltwiseGeluErf:
+            case Algorithm::EltwiseGeluTanh:
             case Algorithm::EltwiseElu:
             case Algorithm::EltwiseTanh:
             case Algorithm::EltwiseSigmoid:
@@ -1816,7 +1837,8 @@ size_t Eltwise::getOpInputsNum() const {
         case Algorithm::EltwiseIsInf:
         case Algorithm::EltwiseIsNaN:
         case Algorithm::EltwiseRelu:
-        case Algorithm::EltwiseGelu:
+        case Algorithm::EltwiseGeluErf:
+        case Algorithm::EltwiseGeluTanh:
         case Algorithm::EltwiseElu:
         case Algorithm::EltwiseTanh:
         case Algorithm::EltwiseSigmoid:
@@ -1835,6 +1857,7 @@ size_t Eltwise::getOpInputsNum() const {
         case Algorithm::EltwiseRoundHalfToEven:
         case Algorithm::EltwiseRoundHalfAwayFromZero:
         case Algorithm::EltwiseSoftSign:
+        case Algorithm::EltwiseLog:
             return 1;
         case Algorithm::EltwiseAdd:
         case Algorithm::EltwiseSubtract:
@@ -1899,6 +1922,8 @@ void Eltwise::initSupportedPrimitiveDescriptors() {

     // if dim rank is greater than the maximum possible, we should use the reference execution
     bool canUseOptimizedImpl = mayiuse(x64::sse41) && getInputShapeAtPort(0).getRank() <= MAX_ELTWISE_DIM_RANK;
+    // TODO: Add EltwiseLog algorithm support for JIT implementation
+    canUseOptimizedImpl &= !one_of(getAlgorithm(), Algorithm::EltwiseLog);
     bool canUseOptimizedShapeAgnosticImpl = isDynamicNode() && canUseOptimizedImpl;

     if (!canUseOptimizedImpl && !fusedWith.empty()) {
@ -1992,7 +2017,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() {
        Blocked
    };

    auto initDesc = [&] (LayoutType lt) -> NodeDesc {
    auto initDesc = [&] (LayoutType lt, bool useAclExecutor = false) -> NodeDesc {
        auto createMemoryDesc = [lt](const Shape &shape, Precision prc, size_t offset) -> std::shared_ptr<CpuBlockedMemoryDesc> {
            const auto &dims = shape.getDims();
            if (lt == ChannelsFirst && shape.getRank() != 1) {
@ -2072,18 +2097,36 @@ void Eltwise::initSupportedPrimitiveDescriptors() {

        config.outConfs.push_back(portConfig);

        impl_desc_type impl_type;
        if (mayiuse(x64::avx512_core)) {
            impl_type = impl_desc_type::jit_avx512;
        } else if (mayiuse(x64::avx2)) {
            impl_type = impl_desc_type::jit_avx2;
        } else if (mayiuse(x64::sse41)) {
            impl_type = impl_desc_type::jit_sse42;
        } else {
            impl_type = impl_desc_type::ref;
        }

        return {config, impl_type};
        if (useAclExecutor) {
            impl_desc_type impl_type = impl_desc_type::undef;

            std::vector<MemoryDescPtr> srcMemoryDescs;
            for (int i = 0; i < config.inConfs.size(); i++) {
                srcMemoryDescs.push_back(config.inConfs[i].getMemDesc());
            }
            std::vector<MemoryDescPtr> dstMemoryDescs;
            for (int i = 0; i < config.outConfs.size(); i++) {
                dstMemoryDescs.push_back(config.outConfs[i].getMemDesc());
            }

            auto factory = std::make_shared<EltwiseExecutorFactory>(eltwiseAttrs, srcMemoryDescs, dstMemoryDescs,
                                                                    std::make_shared<ExecutorContext>(context, getPrimitivesPriority()));

            return {config, impl_type, !factory->isEmpty() ? factory : nullptr};
        } else {
            impl_desc_type impl_type = impl_desc_type::ref;
            if (canUseOptimizedImpl) {
                if (mayiuse(x64::avx512_core)) {
                    impl_type = impl_desc_type::jit_avx512;
                } else if (mayiuse(x64::avx2)) {
                    impl_type = impl_desc_type::jit_avx2;
                } else if (mayiuse(x64::sse41)) {
                    impl_type = impl_desc_type::jit_sse42;
                }
            }

            return {config, impl_type};
        }
    };

    bool isChannelsFirstApplicable = one_of(getOutputShapeAtPort(0).getRank(), 1u, 2u, 3u, 4u, 5u);
@ -2105,14 +2148,31 @@ void Eltwise::initSupportedPrimitiveDescriptors() {
        isBlockedApplicable = isBlockedApplicable && inShape.getMinDims()[1] != Shape::UNDEFINED_DIM && inShape.getMinDims()[1] > 1;
    }

    inputNum = getParentEdges().size();
    currentInBlkDims.resize(inputNum);

#if defined (OV_CPU_WITH_ACL)
    eltwiseAttrs = {algorithm, alpha, beta, gamma};
    if (isChannelsFirstApplicable) {
        auto channelFirstDesc = initDesc(ChannelsFirst, true);
        if (channelFirstDesc.getExecutorFactory())
            supportedPrimitiveDescriptors.emplace_back(channelFirstDesc);
    }

    auto planarDesc = initDesc(Planar, true);
    if (planarDesc.getExecutorFactory())
        supportedPrimitiveDescriptors.emplace_back(planarDesc);

    canUseAclExecutor = !supportedPrimitiveDescriptors.empty();
    if (canUseAclExecutor)
        return;
#endif

    if (isChannelsFirstApplicable)
        supportedPrimitiveDescriptors.emplace_back(initDesc(ChannelsFirst));
    if (isBlockedApplicable)
        supportedPrimitiveDescriptors.emplace_back(initDesc(Blocked));
    supportedPrimitiveDescriptors.emplace_back(initDesc(Planar));

    inputNum = getParentEdges().size();
    currentInBlkDims.resize(inputNum);
}

void Eltwise::createPrimitive() {
@ -2141,6 +2201,21 @@ void Eltwise::createPrimitive() {
}

void Eltwise::prepareParams() {
    if (canUseAclExecutor) {
        std::vector<MemoryDescPtr> srcMemoryDescs;
        for (int i = 0; i < getParentEdges().size(); i++) {
            srcMemoryDescs.push_back(getParentEdgeAt(i)->getMemoryPtr()->getDescPtr());
        }
        std::vector<MemoryDescPtr> dstMemoryDescs;
        dstMemoryDescs.push_back(getChildEdgeAt(0)->getMemoryPtr()->getDescPtr());

        auto selectedPD = getSelectedPrimitiveDescriptor();
        aclExecPtr = selectedPD->getExecutorFactoryAs<EltwiseExecutorFactory>()->makeExecutor(eltwiseAttrs, srcMemoryDescs, dstMemoryDescs, {});
        selectedPD->setImplementationType(aclExecPtr->getImplType());

        return;
    }

    auto outBlockingDesc = getChildEdgeAt(0)->getMemory().GetDescWithType<BlockedMemoryDesc>();
    const auto &outOrder = outBlockingDesc->getOrder();
    const auto &currentOutBlkDims = outBlockingDesc->getBlockDims();
@ -2309,6 +2384,15 @@ void Eltwise::execute(dnnl::stream strm) {
        }

        execPtr->exec(args_ptrs, dims_out);
    } else if (aclExecPtr) {
        std::vector<MemoryCPtr> srcMemory;
        for (int i = 0; i < getParentEdges().size(); i++) {
            srcMemory.push_back(getParentEdgeAt(i)->getMemoryPtr());
        }
        std::vector<MemoryPtr> dstMemory;
        dstMemory.push_back(getChildEdgeAt(0)->getMemoryPtr());

        aclExecPtr->exec(srcMemory, dstMemory, fqDataPtrs.data());
    } else {
        IE_THROW() << "Can't execute eltwise node with name: " << getName() << ". Primitive isn't created";
    }
@ -2594,6 +2678,9 @@ bool Eltwise::canFuse(const NodePtr& node) const {
    if (!mayiuse(x64::sse41) || getInputShapeAtPort(0).getRank() > MAX_ELTWISE_DIM_RANK)
        return false;

    // TODO: EltwiseLog is supported only via reference executor
    if (getAlgorithm() == Algorithm::EltwiseLog || node->getAlgorithm() == Algorithm::EltwiseLog)
        return false;

    bool isIntegerNode = isIntegerComputeSupported(this);
    if (isIntegerNode && node->getType() != Type::Eltwise)
@ -2669,4 +2756,4 @@ InferenceEngine::Precision Eltwise::getRuntimePrecision() const {

} // namespace node
} // namespace intel_cpu
} // namespace ov
} // namespace ov
@ -10,6 +10,7 @@

#include <vector>
#include <memory>
#include <caseless.hpp>
#include "executors/eltwise_list.hpp"

namespace ov {
namespace intel_cpu {
@ -199,6 +200,10 @@ private:

    void appendMemory(const std::vector<float> &data, MemoryPtr &memPtr, std::vector<MemoryPtr>& postOpsMem);
    void appendMemory(const std::vector<float> &data, MemoryPtr &memPtr, std::vector<const void*>& postOpsMem);

    bool canUseAclExecutor = false;
    EltwiseAttrs eltwiseAttrs;
    std::shared_ptr<EltwiseExecutor> aclExecPtr = nullptr;
};

class eltwise_precision_helper {
@ -213,4 +218,4 @@ private:

} // namespace node
} // namespace intel_cpu
} // namespace ov
} // namespace ov
390
src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
Normal file
@ -0,0 +1,390 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_eltwise.hpp"
#include "acl_utils.hpp"

namespace ov {
namespace intel_cpu {

using namespace arm_compute;

inline VectorDims reshape_sizes(VectorDims dims) {
    const size_t MAX_NUM_SHAPE = arm_compute::MAX_DIMS;
    VectorDims result_dims(MAX_NUM_SHAPE - 1);
    if (dims.size() >= MAX_NUM_SHAPE) {
        for (int i = 0; i < MAX_NUM_SHAPE - 1; i++) {
            result_dims[i] = dims[i];
        }
        for (int i = MAX_NUM_SHAPE - 1; i < dims.size(); i++) {
            result_dims[MAX_NUM_SHAPE - 2] *= dims[i];
        }
    } else {
        result_dims = dims;
    }
    return result_dims;
}
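[Editor's note] To make the collapse above concrete, here is a minimal standalone sketch of the same logic with MAX_NUM_SHAPE fixed to 6 (the value of arm_compute::MAX_DIMS at the time of writing); the sample 7-D shape is illustrative:

#include <cstdio>
#include <vector>

using VectorDims = std::vector<size_t>;

// Local copy of the collapsing logic: keep the first five dimensions
// and fold every remaining dimension into the last kept slot.
VectorDims reshape_sizes_demo(VectorDims dims) {
    const size_t MAX_NUM_SHAPE = 6;
    VectorDims result(MAX_NUM_SHAPE - 1);
    if (dims.size() >= MAX_NUM_SHAPE) {
        for (size_t i = 0; i < MAX_NUM_SHAPE - 1; i++)
            result[i] = dims[i];
        for (size_t i = MAX_NUM_SHAPE - 1; i < dims.size(); i++)
            result[MAX_NUM_SHAPE - 2] *= dims[i];   // fold the tail
    } else {
        result = dims;
    }
    return result;
}

int main() {
    for (size_t d : reshape_sizes_demo({2, 3, 4, 5, 6, 7, 8}))
        printf("%zu ", d);   // prints: 2 3 4 5 336
    return 0;
}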
AclEltwiseExecutor::AclEltwiseExecutor(const ExecutorContext::CPtr context) : EltwiseExecutor(context) {}

bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vector<MemoryDescPtr> &srcDescs,
                              const std::vector<MemoryDescPtr> &dstDescs,
                              const std::vector<EltwisePostOp> &postOps) {
    if (!postOps.empty()) { return false; }
    aclEltwiseAttrs = eltwiseAttrs;

    std::vector<arm_compute::TensorShape> srcVecDims(srcDescs.size()), dstVecDims(dstDescs.size());
    std::vector<arm_compute::DataLayout> srcDataLayout(srcDescs.size()), dstDataLayout(dstDescs.size());
    std::vector<arm_compute::TensorInfo> srcTensorsInfo(srcDescs.size()), dstTensorsInfo(dstDescs.size());
    srcTensors = std::vector<arm_compute::Tensor>(srcDescs.size());
    dstTensors = std::vector<arm_compute::Tensor>(dstDescs.size());

    for (int i = 0; i < srcVecDims.size(); i++) {
        srcVecDims[i] = shapeCast(reshape_sizes(srcDescs[i]->getShape().getDims()));
    }
    for (int i = 0; i < dstVecDims.size(); i++) {
        dstVecDims[i] = shapeCast(reshape_sizes(dstDescs[i]->getShape().getDims()));
    }

    for (int i = 0; i < srcDescs.size(); i++) {
        srcDataLayout[i] = getAclDataLayoutByMemoryDesc(srcDescs[i]);
        if (srcDataLayout[i] == arm_compute::DataLayout::UNKNOWN) { return false; }
    }
    for (int i = 0; i < dstDescs.size(); i++) {
        dstDataLayout[i] = getAclDataLayoutByMemoryDesc(dstDescs[i]);
        if (dstDataLayout[i] == arm_compute::DataLayout::UNKNOWN) { return false; }
    }

    if (srcDescs.size() == 2 &&
        srcDescs[0]->hasLayoutType(LayoutType::nspc) && srcDescs[1]->hasLayoutType(LayoutType::nspc) &&
        srcDescs[0]->getShape().getDims() != srcDescs[1]->getShape().getDims()) {
        auto dim_size = srcDescs[0]->getShape().getDims().size();
        auto mover = [&dim_size](TensorShape &_shape) {
            if (dim_size == 5) { std::swap(_shape[2], _shape[3]); }
            std::swap(_shape[1], _shape[2]);
            std::swap(_shape[0], _shape[1]);
        };
        if (dim_size < 5) {
            srcDataLayout[0] = srcDataLayout[1] = dstDataLayout[0] = DataLayout::NCHW;
        } else {
            srcDataLayout[0] = srcDataLayout[1] = dstDataLayout[0] = DataLayout::NCDHW;
        }
        mover(srcVecDims[0]);
        mover(srcVecDims[1]);
        mover(dstVecDims[0]);
    }

    for (int i = 0; i < srcVecDims.size(); i++) {
        srcTensorsInfo[i] = TensorInfo(srcVecDims[i], 1,
                                       precisionToAclDataType(srcDescs[i]->getPrecision()),
                                       srcDataLayout[i]);
        srcTensors[i].allocator()->init(srcTensorsInfo[i]);
    }

    for (int i = 0; i < dstVecDims.size(); i++) {
        dstTensorsInfo[i] = TensorInfo(dstVecDims[i], 1,
                                       precisionToAclDataType(dstDescs[i]->getPrecision()),
                                       dstDataLayout[i]);
        dstTensors[i].allocator()->init(dstTensorsInfo[i]);
    }
    switch (aclEltwiseAttrs.algorithm) {
    case Algorithm::EltwiseAdd:
        if (!NEArithmeticAddition::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ConvertPolicy::SATURATE))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEArithmeticAddition>();
            acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ConvertPolicy::SATURATE);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseMultiply:
        if (!NEPixelWiseMultiplication::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0],
                                                 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEPixelWiseMultiplication>();
            acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseSubtract:
        if (!NEArithmeticSubtraction::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ConvertPolicy::SATURATE))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEArithmeticSubtraction>();
            acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ConvertPolicy::SATURATE);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseDivide:
        if (!NEElementwiseDivision::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEElementwiseDivision>();
            acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseMaximum:
        if (!NEElementwiseMax::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEElementwiseMax>();
            acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseMinimum:
        if (!NEElementwiseMin::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEElementwiseMin>();
            acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseSquaredDifference:
        if (!NEElementwiseSquaredDiff::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEElementwiseSquaredDiff>();
            acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]);
            acl_op->run();
        };
        break;
    case Algorithm::EltwisePowerDynamic:
        if (!NEElementwisePower::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEElementwisePower>();
            acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseEqual:
        if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::Equal))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEElementwiseComparison>();
            acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Equal);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseNotEqual:
        if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::NotEqual))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEElementwiseComparison>();
            acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::NotEqual);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseGreater:
        if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::Greater))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEElementwiseComparison>();
            acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Greater);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseGreaterEqual:
        if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::GreaterEqual))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEElementwiseComparison>();
            acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::GreaterEqual);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseLess:
        if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::Less))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEElementwiseComparison>();
            acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Less);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseLessEqual:
        if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::LessEqual))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEElementwiseComparison>();
            acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::LessEqual);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseRelu:
        if (aclEltwiseAttrs.alpha == 0) {
            if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
                                             ActivationLayerInfo::ActivationFunction::RELU))
                return false;
        } else {
            if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
                                             {ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha}))
                return false;
        }
        exec_func = [this]{
            auto acl_op = std::make_unique<NEActivationLayer>();
            if (aclEltwiseAttrs.alpha == 0) {
                acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::RELU);
            } else {
                acl_op->configure(&srcTensors[0], &dstTensors[0],
                                  {ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha});
            }
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseGeluErf:
        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::GELU))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEActivationLayer>();
            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::GELU);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseElu:
        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
                                         {ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha}))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEActivationLayer>();
            acl_op->configure(&srcTensors[0], &dstTensors[0], {ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha});
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseTanh:
        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
                                         {ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f}))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEActivationLayer>();
            acl_op->configure(&srcTensors[0], &dstTensors[0],
                              {ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f});
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseSigmoid:
        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::LOGISTIC))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEActivationLayer>();
            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::LOGISTIC);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseAbs:
        if (!NEAbsLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEAbsLayer>();
            acl_op->configure(&srcTensors[0], &dstTensors[0]);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseSqrt:
        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SQRT))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEActivationLayer>();
            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SQRT);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseSoftRelu:
        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEActivationLayer>();
            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseExp:
        if (!NEExpLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEExpLayer>();
            acl_op->configure(&srcTensors[0], &dstTensors[0]);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseClamp:
        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
                                         {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha}))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEActivationLayer>();
            acl_op->configure(&srcTensors[0], &dstTensors[0],
                              {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha});
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseSwish:
        // Use the same attribute in validate() and configure(); the original
        // patch validated with beta but configured with alpha.
        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
                                         {ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha}))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEActivationLayer>();
            acl_op->configure(&srcTensors[0], &dstTensors[0],
                              {ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha});
            acl_op->run();
        };
        break;
    case Algorithm::EltwisePrelu:
        if (!NEPReluLayer::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEPReluLayer>();
            acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseHswish:
        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NEActivationLayer>();
            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH);
            acl_op->run();
        };
        break;
    case Algorithm::EltwiseLog:
        if (!NELogLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
            return false;
        exec_func = [this]{
            auto acl_op = std::make_unique<NELogLayer>();
            acl_op->configure(&srcTensors[0], &dstTensors[0]);
            acl_op->run();
        };
        break;
    default:
        IE_THROW() << "Unsupported operation type for ACL Eltwise executor: " << static_cast<int>(aclEltwiseAttrs.algorithm);
    }
    return true;
}
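[Editor's note] Every case above follows the same ACL contract: a static, side-effect-free validate() on the exact TensorInfo combination, then configure() and run() on a freshly built function object over tensors that merely wrap caller-owned memory. A condensed, self-contained sketch of that contract (the helper name run_add and the fixed 8-element float shape are illustrative, not part of this patch):

#include "arm_compute/runtime/NEON/NEFunctions.h"

int run_add(float* a, float* b, float* out) {
    using namespace arm_compute;
    TensorInfo info(TensorShape(8U), 1, DataType::F32);

    // 1) Reject early: validate() touches no state.
    if (!NEArithmeticAddition::validate(&info, &info, &info, ConvertPolicy::SATURATE))
        return -1;

    Tensor srcA, srcB, dst;
    srcA.allocator()->init(info);
    srcB.allocator()->init(info);
    dst.allocator()->init(info);

    // 2) Zero-copy: wrap the caller's buffers instead of allocating.
    srcA.allocator()->import_memory(a);
    srcB.allocator()->import_memory(b);
    dst.allocator()->import_memory(out);

    // 3) configure() + run() on a function object.
    NEArithmeticAddition add;
    add.configure(&srcA, &srcB, &dst, ConvertPolicy::SATURATE);
    add.run();

    srcA.allocator()->free();
    srcB.allocator()->free();
    dst.allocator()->free();
    return 0;
}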
void AclEltwiseExecutor::exec(const std::vector<MemoryCPtr> &src, const std::vector<MemoryPtr> &dst,
                              const void *post_ops_data_) {
    for (int i = 0; i < src.size(); i++) {
        srcTensors[i].allocator()->import_memory(src[i]->GetPtr());
    }
    for (int i = 0; i < dst.size(); i++) {
        dstTensors[i].allocator()->import_memory(dst[i]->GetPtr());
    }

    exec_func();

    for (int i = 0; i < src.size(); i++) {
        srcTensors[i].allocator()->free();
    }
    for (int i = 0; i < dst.size(); i++) {
        dstTensors[i].allocator()->free();
    }
}
} // namespace intel_cpu
} // namespace ov
110
src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp
Normal file
@ -0,0 +1,110 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "../eltwise.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "acl_utils.hpp"

namespace ov {
namespace intel_cpu {

class AclEltwiseExecutor : public EltwiseExecutor {
public:
    AclEltwiseExecutor(const ExecutorContext::CPtr context);

    bool init(const EltwiseAttrs& eltwiseAttrs,
              const std::vector<MemoryDescPtr>& srcDescs,
              const std::vector<MemoryDescPtr>& dstDescs,
              const std::vector<EltwisePostOp>& postOps) override;

    void exec(const std::vector<MemoryCPtr>& src,
              const std::vector<MemoryPtr>& dst,
              const void *post_ops_data_) override;

    impl_desc_type getImplType() const override {
        return implType;
    }
private:
    EltwiseAttrs aclEltwiseAttrs{};
    impl_desc_type implType = impl_desc_type::acl;
    std::vector<arm_compute::Tensor> srcTensors, dstTensors;
    std::function<void()> exec_func;
};
class AclEltwiseExecutorBuilder : public EltwiseExecutorBuilder {
public:
    bool isSupported(const EltwiseAttrs& eltwiseAttrs,
                     const std::vector<MemoryDescPtr>& srcDescs,
                     const std::vector<MemoryDescPtr>& dstDescs) const override {
        switch (eltwiseAttrs.algorithm) {
        case Algorithm::EltwiseAdd:
        case Algorithm::EltwiseMultiply:
        case Algorithm::EltwiseSubtract:
        case Algorithm::EltwiseDivide:
        case Algorithm::EltwiseMaximum:
        case Algorithm::EltwiseMinimum:
        case Algorithm::EltwiseSquaredDifference:
        case Algorithm::EltwisePowerDynamic:
        case Algorithm::EltwiseEqual:
        case Algorithm::EltwiseNotEqual:
        case Algorithm::EltwiseGreater:
        case Algorithm::EltwiseGreaterEqual:
        case Algorithm::EltwiseLess:
        case Algorithm::EltwiseLessEqual:
        case Algorithm::EltwiseRelu:
        case Algorithm::EltwiseGeluErf:
        case Algorithm::EltwiseElu:
        case Algorithm::EltwiseTanh:
        case Algorithm::EltwiseSigmoid:
        case Algorithm::EltwiseAbs:
        case Algorithm::EltwiseSqrt:
        case Algorithm::EltwiseSoftRelu:
        case Algorithm::EltwiseExp:
        case Algorithm::EltwiseClamp:
        case Algorithm::EltwiseSwish:
        case Algorithm::EltwisePrelu:
        case Algorithm::EltwiseHswish:
        case Algorithm::EltwiseLog:
            break;
        default:
            return false;
        }

        // ACL supports only U8 precision on output for comparison operations
        if (one_of(eltwiseAttrs.algorithm, Algorithm::EltwiseEqual, Algorithm::EltwiseNotEqual, Algorithm::EltwiseGreater,
                   Algorithm::EltwiseGreaterEqual, Algorithm::EltwiseLess, Algorithm::EltwiseLessEqual)) {
            if (dstDescs[0]->getPrecision() != InferenceEngine::Precision::U8) {
                return false;
            }
        }
        for (const auto &srcD : srcDescs) {
            for (const auto &dstD : dstDescs) {
                if ((srcD->getPrecision() != InferenceEngine::Precision::FP32 &&
                     srcD->getPrecision() != InferenceEngine::Precision::FP16) ||
                     srcD->getPrecision() != dstD->getPrecision())
                    return false;
            }
        }

        for (int i = 0; i < srcDescs.size(); i++) {
            if (getAclDataLayoutByMemoryDesc(srcDescs[i]) == arm_compute::DataLayout::UNKNOWN)
                return false;
        }
        for (int i = 0; i < dstDescs.size(); i++) {
            if (getAclDataLayoutByMemoryDesc(dstDescs[i]) == arm_compute::DataLayout::UNKNOWN)
                return false;
        }

        return true;
    }

    EltwiseExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
        return std::make_shared<AclEltwiseExecutor>(context);
    }
};

} // namespace intel_cpu
} // namespace ov
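[Editor's note] A standalone mirror of the precision gate in isSupported() above, using a local stand-in enum so it compiles outside the plugin: every input must be f32 or f16 and must exactly match every output precision.

#include <cstdio>
#include <vector>

enum class Prec { FP32, FP16, U8 };

// Same shape as the nested loop in AclEltwiseExecutorBuilder::isSupported.
static bool precisionsSupported(const std::vector<Prec>& src, const std::vector<Prec>& dst) {
    for (const auto &s : src)
        for (const auto &d : dst)
            if ((s != Prec::FP32 && s != Prec::FP16) || s != d)
                return false;
    return true;
}

int main() {
    printf("%d\n", precisionsSupported({Prec::FP32, Prec::FP32}, {Prec::FP32}));  // 1
    printf("%d\n", precisionsSupported({Prec::FP32, Prec::FP16}, {Prec::FP32}));  // 0: mixed precisions rejected
    return 0;
}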
185
src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp
Normal file
@ -0,0 +1,185 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_interpolate.hpp"
#include "acl_utils.hpp"

static arm_compute::TensorShape interpolateShapeCast(const ov::intel_cpu::VectorDims& dims) {
    arm_compute::TensorShape tensorShape;
    for (std::size_t i = 0; i < dims.size(); ++i) {
        tensorShape.set(dims.size() - i - 1, dims[i], false);
    }
    if (tensorShape.num_dimensions() == 0) {
        tensorShape.set(0, 1, false);
        tensorShape.set_num_dimensions(1);
    }
    return tensorShape;
}
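[Editor's note] The cast reverses OpenVINO's outermost-first dims into ACL's innermost-first TensorShape indexing. A tiny self-contained sketch with an illustrative NCHW shape:

#include <cstdio>
#include <vector>

int main() {
    std::vector<size_t> dims = {1, 3, 224, 224};      // N, C, H, W (OpenVINO order)
    std::vector<size_t> acl(dims.size());
    for (size_t i = 0; i < dims.size(); ++i)
        acl[dims.size() - i - 1] = dims[i];           // same index flip as above
    for (size_t d : acl)
        printf("%zu ", d);                            // prints: 224 224 3 1 (W, H, C, N)
    return 0;
}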
bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs &interpolateAttrs,
                                                 const std::vector<MemoryDescPtr> &srcDescs,
                                                 const std::vector<MemoryDescPtr> &dstDescs,
                                                 const dnnl::primitive_attr &attr) {
    InterpolateExecutor::init(interpolateAttrs, srcDescs, dstDescs, attr);
    aclInterpolateAttrs = interpolateAttrs;
    auto& coord_mode = aclInterpolateAttrs.coordTransMode;
    auto& inter_mode = aclInterpolateAttrs.mode;
    acl_coord = arm_compute::SamplingPolicy::TOP_LEFT;
    auto& out_shape = dstDescs[0]->getShape().getDims();

    if ((coord_mode == InterpolateCoordTransMode::pytorch_half_pixel && out_shape[2] > 1 && out_shape[3] > 1) ||
        coord_mode == InterpolateCoordTransMode::half_pixel) {
        acl_coord = arm_compute::SamplingPolicy::CENTER;
    }

    switch (inter_mode) {
    case InterpolateMode::linear:
    case InterpolateMode::linear_onnx:
        acl_policy = arm_compute::InterpolationPolicy::BILINEAR;
        break;
    case InterpolateMode::nearest:
        acl_policy = arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR;
        break;
    default:
        return false;
    }

    auto srcDims = srcDescs[0]->getShape().getStaticDims();
    auto dstDims = dstDescs[0]->getShape().getStaticDims();
    auto srcTensorInfo = arm_compute::TensorInfo(interpolateShapeCast(srcDims), 1,
                                                 precisionToAclDataType(srcDescs[0]->getPrecision()),
                                                 getAclDataLayoutByMemoryDesc(srcDescs[0]));
    auto dstTensorInfo = arm_compute::TensorInfo(interpolateShapeCast(dstDims), 1,
                                                 precisionToAclDataType(dstDescs[0]->getPrecision()),
                                                 getAclDataLayoutByMemoryDesc(dstDescs[0]));

    if (!arm_compute::NEScale::validate(&srcTensorInfo,
                                        &dstTensorInfo,
                                        arm_compute::ScaleKernelInfo(acl_policy,
                                                                     arm_compute::BorderMode::REPLICATE,
                                                                     arm_compute::PixelValue(),
                                                                     acl_coord,
                                                                     false,
                                                                     coord_mode == InterpolateCoordTransMode::align_corners)))
        return false;

    srcTensor.allocator()->init(srcTensorInfo);
    dstTensor.allocator()->init(dstTensorInfo);

    acl_scale = std::make_unique<arm_compute::NEScale>();
    acl_scale->configure(&srcTensor, &dstTensor, arm_compute::ScaleKernelInfo(acl_policy,
                                                                              arm_compute::BorderMode::REPLICATE,
                                                                              arm_compute::PixelValue(),
                                                                              acl_coord,
                                                                              false,
                                                                              aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::align_corners));
    return true;
}
void ov::intel_cpu::ACLInterpolateExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) {
    auto in_ptr_ = padPreprocess(src, dst);
    srcTensor.allocator()->import_memory(const_cast<void *>(reinterpret_cast<const void *>(in_ptr_)));
    dstTensor.allocator()->import_memory(dst[0]->GetPtr());

    acl_scale->run();

    srcTensor.allocator()->free();
    dstTensor.allocator()->free();
}
bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupportedConfiguration(
        const ov::intel_cpu::InterpolateAttrs &interpolateAttrs, const std::vector<MemoryDescPtr> &srcDescs,
        const std::vector<MemoryDescPtr> &dstDescs) {
    auto& inp_shape = srcDescs[0]->getShape().getDims();
    auto& out_shape = dstDescs[0]->getShape().getDims();

    float scale_h = static_cast<float>(out_shape[2]) / inp_shape[2];
    float scale_w = static_cast<float>(out_shape[3]) / inp_shape[3];
    bool is_upsample = scale_h > 1 && scale_w > 1;

    auto& coord_mode = interpolateAttrs.coordTransMode;
    auto& nearest_mode = interpolateAttrs.nearestMode;

    if (coord_mode == InterpolateCoordTransMode::asymmetric &&
        nearest_mode == InterpolateNearestMode::floor) {
        return is_upsample;
    }

    if (coord_mode == InterpolateCoordTransMode::align_corners &&
        nearest_mode == InterpolateNearestMode::round_prefer_ceil) {
        return true;
    }

    if (coord_mode == InterpolateCoordTransMode::half_pixel &&
        (nearest_mode == InterpolateNearestMode::simple || nearest_mode == InterpolateNearestMode::round_prefer_ceil)) {
        return false;
    }

    if (coord_mode == InterpolateCoordTransMode::asymmetric &&
        (nearest_mode == InterpolateNearestMode::simple || nearest_mode == InterpolateNearestMode::floor)) {
        return is_upsample;
    }

    if (is_upsample) {
        bool int_factor = scale_h == static_cast<int>(scale_h) && scale_w == static_cast<int>(scale_w);
        if (int_factor && coord_mode != InterpolateCoordTransMode::asymmetric &&
            (nearest_mode == InterpolateNearestMode::round_prefer_ceil
             || nearest_mode == InterpolateNearestMode::round_prefer_floor)) {
            return true;
        }
    } else if (scale_h < 1 && scale_w < 1) {
        float down_scale_h = static_cast<float>(inp_shape[2]) / out_shape[2];
        float down_scale_w = static_cast<float>(inp_shape[3]) / out_shape[3];
        bool int_factor = down_scale_h == static_cast<int>(down_scale_h) && down_scale_w == static_cast<int>(down_scale_w);

        if (int_factor && coord_mode != InterpolateCoordTransMode::align_corners &&
            nearest_mode == InterpolateNearestMode::simple) {
            return true;
        }

        if (int_factor && nearest_mode == InterpolateNearestMode::round_prefer_ceil &&
            ((out_shape[2] > 1 && out_shape[3] > 1) || coord_mode != InterpolateCoordTransMode::half_pixel)) {
            return true;
        }
    }
    return false;
}
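[Editor's note] The function above keys its decisions on the H/W scale factors and whether they are clean integers. A tiny self-contained sketch of just that arithmetic (the 10x10 -> 20x30 sizes are illustrative):

#include <cstdio>

int main() {
    const float in_h = 10, in_w = 10, out_h = 20, out_w = 30;
    float scale_h = out_h / in_h;                               // 2.0
    float scale_w = out_w / in_w;                               // 3.0
    bool is_upsample = scale_h > 1 && scale_w > 1;              // true
    bool int_factor = scale_h == static_cast<int>(scale_h) &&
                      scale_w == static_cast<int>(scale_w);     // true: both are whole numbers
    printf("upsample=%d integer_factor=%d\n", is_upsample, int_factor);
    return 0;
}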
bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupported(const ov::intel_cpu::InterpolateAttrs &interpolateAttrs,
                                                               const std::vector<MemoryDescPtr> &srcDescs,
                                                               const std::vector<MemoryDescPtr> &dstDescs) const {
    if (srcDescs[0]->getShape().getDims().size() != 4) {
        return false;
    }

    auto& pads_begin = interpolateAttrs.padBegin;
    auto& pads_end = interpolateAttrs.padEnd;

    if (!std::all_of(pads_begin.begin(), pads_begin.end(), [](int i){return i == 0;}) ||
        !std::all_of(pads_end.begin(), pads_end.end(), [](int i){return i == 0;})) {
        return false;
    }

    auto& nearest_mode = interpolateAttrs.nearestMode;
    auto& coord_mode = interpolateAttrs.coordTransMode;
    if (interpolateAttrs.antialias ||
        coord_mode == InterpolateCoordTransMode::tf_half_pixel_for_nn ||
        nearest_mode == InterpolateNearestMode::ceil) {
        return false;
    }

    if (interpolateAttrs.mode == InterpolateMode::cubic) {
        return false;
    }

    if (interpolateAttrs.mode == InterpolateMode::nearest &&
        !isSupportedConfiguration(interpolateAttrs, srcDescs, dstDescs)) {
        return false;
    }

    if (coord_mode == InterpolateCoordTransMode::pytorch_half_pixel) {
        return false;
    }
    return true;
}
52
src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.hpp
Normal file
@ -0,0 +1,52 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "../interpolate.hpp"

namespace ov {
namespace intel_cpu {

class ACLInterpolateExecutor : public InterpolateExecutor {
public:
    ACLInterpolateExecutor(const ExecutorContext::CPtr context) : InterpolateExecutor(context) {}

    bool init(const InterpolateAttrs& interpolateAttrs,
              const std::vector<MemoryDescPtr>& srcDescs,
              const std::vector<MemoryDescPtr>& dstDescs,
              const dnnl::primitive_attr &attr) override;

    void exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) override;

    impl_desc_type getImplType() const override {
        return implType;
    }

private:
    impl_desc_type implType = impl_desc_type::acl;
    InterpolateAttrs aclInterpolateAttrs;
    arm_compute::SamplingPolicy acl_coord;
    arm_compute::InterpolationPolicy acl_policy;
    bool antialias{};
    arm_compute::Tensor srcTensor, dstTensor;
    std::unique_ptr<arm_compute::NEScale> acl_scale;
};

class ACLInterpolateExecutorBuilder : public InterpolateExecutorBuilder {
public:
    bool isSupported(const InterpolateAttrs& interpolateAttrs,
                     const std::vector<MemoryDescPtr>& srcDescs,
                     const std::vector<MemoryDescPtr>& dstDescs) const override;

    InterpolateExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
        return std::make_shared<ACLInterpolateExecutor>(context);
    }
private:
    static bool isSupportedConfiguration(const InterpolateAttrs& interpolateAttrs,
                                         const std::vector<MemoryDescPtr>& srcDescs,
                                         const std::vector<MemoryDescPtr>& dstDescs);
};
} // namespace intel_cpu
} // namespace ov
76
src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.cpp
Normal file
@ -0,0 +1,76 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_mvn.hpp"

namespace ov {
namespace intel_cpu {

using namespace arm_compute;

AclMVNExecutor::AclMVNExecutor(const ExecutorContext::CPtr context) : MVNExecutor(context) {}

bool AclMVNExecutor::init(const MVNAttrs& mvnAttrs,
                          const std::vector<MemoryDescPtr>& srcDescs,
                          const std::vector<MemoryDescPtr>& dstDescs,
                          const dnnl::primitive_attr &attr) {
    auto srcDims = srcDescs[0]->getShape().getStaticDims();
    auto dstDims = dstDescs[0]->getShape().getStaticDims();

    size_t X, Y;
    if (mvnAttrs.initAcrossChannels_) {
        if (srcDims.size() >= 2) {
            Y = srcDims[0];
            X = srcDims[1];
            for (int i = 2; i < srcDims.size(); i++) {
                X *= srcDims[i];
            }
        } else {
            Y = srcDims[0];
            X = 1;
        }
    } else {
        if (srcDims.size() > 2) {
            Y = srcDims[0] * srcDims[1];
            X = srcDims[2];
            for (int i = 3; i < srcDims.size(); i++) {
                X *= srcDims[i];
            }
        } else if (srcDims.size() == 2) {
            Y = srcDims[0] * srcDims[1];
            X = 1;
        } else {
            Y = srcDims[0];
            X = 1;
        }
    }

    TensorInfo srcTensorInfo = TensorInfo(TensorShape(X, Y), 1, precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0]));
    TensorInfo dstTensorInfo = TensorInfo(TensorShape(X, Y), 1, precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0]));

    if (!arm_compute::NEMeanStdDevNormalizationLayer::validate(&srcTensorInfo, &dstTensorInfo, mvnAttrs.epsValue_))
        return false;

    srcTensor.allocator()->init(srcTensorInfo);
    dstTensor.allocator()->init(dstTensorInfo);

    mvn = std::make_unique<arm_compute::NEMeanStdDevNormalizationLayer>();
    mvn->configure(&srcTensor, &dstTensor, mvnAttrs.epsValue_);

    return true;
}
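[Editor's note] MVN is lowered here to a row-wise 2-D normalization: Y is the number of independent rows and X the number of elements each row's statistics cover. A small self-contained sketch reproducing the (X, Y) computation for an illustrative 2x3x4x5 input:

#include <cstdio>
#include <vector>

// Mirrors the collapse in AclMVNExecutor::init above.
static void collapse(const std::vector<size_t>& dims, bool acrossChannels,
                     size_t& X, size_t& Y) {
    if (acrossChannels) {
        Y = dims[0];                                          // one row per batch element
        X = 1;
        for (size_t i = 1; i < dims.size(); i++) X *= dims[i];
    } else {
        Y = dims.size() >= 2 ? dims[0] * dims[1] : dims[0];   // one row per (n, c) pair
        X = 1;
        for (size_t i = 2; i < dims.size(); i++) X *= dims[i];
    }
}

int main() {
    size_t X, Y;
    collapse({2, 3, 4, 5}, true, X, Y);
    printf("across:      X=%zu Y=%zu\n", X, Y);   // X=60 Y=2
    collapse({2, 3, 4, 5}, false, X, Y);
    printf("per-channel: X=%zu Y=%zu\n", X, Y);   // X=20 Y=6
    return 0;
}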
void AclMVNExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) {
    srcTensor.allocator()->import_memory(src[0]->GetPtr());
    dstTensor.allocator()->import_memory(dst[0]->GetPtr());

    mvn->run();

    srcTensor.allocator()->free();
    dstTensor.allocator()->free();
}

} // namespace intel_cpu
} // namespace ov
73
src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.hpp
Normal file
@ -0,0 +1,73 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "acl_utils.hpp"
#include "nodes/executors/mvn.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"

namespace ov {
namespace intel_cpu {

class AclMVNExecutor : public MVNExecutor {
public:
    AclMVNExecutor(const ExecutorContext::CPtr context);

    bool init(const MVNAttrs& mvnAttrs,
              const std::vector<MemoryDescPtr>& srcDescs,
              const std::vector<MemoryDescPtr>& dstDescs,
              const dnnl::primitive_attr &attr) override;
    void exec(const std::vector<MemoryCPtr>& src,
              const std::vector<MemoryPtr>& dst,
              const void *post_ops_data_) override;

    impl_desc_type getImplType() const override {
        return implType;
    }

private:
    impl_desc_type implType = impl_desc_type::acl;

    arm_compute::Tensor srcTensor;
    arm_compute::Tensor dstTensor;
    std::unique_ptr<arm_compute::NEMeanStdDevNormalizationLayer> mvn = nullptr;
};
class AclMVNExecutorBuilder : public MVNExecutorBuilder {
public:
    bool isSupported(const MVNAttrs& mvnAttrs,
                     const std::vector<MemoryDescPtr>& srcDescs,
                     const std::vector<MemoryDescPtr>& dstDescs) const override {
        if ((srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP32 &&
             srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP16) ||
             srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision())
            return false;

        if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
              dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
            !(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
              dstDescs[0]->hasLayoutType(LayoutType::nspc)))
            return false;

        if (mvnAttrs.epsMode_ == MVNEpsMode::OUTSIDE_SQRT) {
            return false;
        }
        if (!mvnAttrs.normalizeVariance_) {
            return false;
        }
        if (!mvnAttrs.initAcrossChannels_ && getAclDataLayoutByMemoryDesc(srcDescs[0]) == arm_compute::DataLayout::NHWC) {
            return false;
        }

        return true;
    }

    MVNExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
        return std::make_shared<AclMVNExecutor>(context);
    }
};

} // namespace intel_cpu
} // namespace ov
183
src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.cpp
Normal file
@ -0,0 +1,183 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_pooling.hpp"
#include "acl_utils.hpp"

namespace ov {
namespace intel_cpu {

using namespace arm_compute;

AclPoolingExecutor::AclPoolingExecutor(const ExecutorContext::CPtr context) : PoolingExecutor(context) {}

bool AclPoolingExecutor::isSupported(const TensorInfo& srcTensorInfo,
                                     const TensorInfo& dstTensorInfo,
                                     const PoolingAttrs& poolingAttrs,
                                     size_t srcDimsSize,
                                     size_t dstDescsSize,
                                     DataLayout dataLayout,
                                     const VectorDims* indDims,
                                     PoolingLayerInfo* pool_info,
                                     Pooling3dLayerInfo* pool3d_info) {
    unsigned int pad_left = (poolingAttrs.data_pad_begin.size() >= 2) ? poolingAttrs.data_pad_begin[1] : poolingAttrs.data_pad_begin[0];
    unsigned int pad_right = (poolingAttrs.data_pad_end.size() >= 2) ? poolingAttrs.data_pad_end[1] : poolingAttrs.data_pad_end[0];
    unsigned int pad_top = (poolingAttrs.data_pad_begin.size() >= 2) ? poolingAttrs.data_pad_begin[0] : 0;
    unsigned int pad_bottom = (poolingAttrs.data_pad_end.size() >= 2) ? poolingAttrs.data_pad_end[0] : 0;
    unsigned int kernel_w = (poolingAttrs.kernel.size() >= 2) ? poolingAttrs.kernel[1] : poolingAttrs.kernel[0];
    unsigned int kernel_h = (poolingAttrs.kernel.size() >= 2) ? poolingAttrs.kernel[0] : 1;
    unsigned int stride_x = (poolingAttrs.stride.size() >= 2) ? poolingAttrs.stride[1] : poolingAttrs.stride[0];
    unsigned int stride_y = (poolingAttrs.stride.size() >= 2) ? poolingAttrs.stride[0] : 1;

    PoolingType pool_type;
    bool exclude_padding = false;
    if (poolingAttrs.algorithm == Algorithm::PoolingMax) {
        pool_type = PoolingType::MAX;
        exclude_padding = (poolingAttrs.pad_type != op::PadType::EXPLICIT);
    } else if (poolingAttrs.algorithm == Algorithm::PoolingAvg) {
        pool_type = PoolingType::AVG;
        exclude_padding = poolingAttrs.exclude_pad;
    } else {
        DEBUG_LOG("Unknown pooling algorithm: ", static_cast<int>(poolingAttrs.algorithm));
        return false;
    }
    DimensionRoundingType round = (poolingAttrs.rounding == op::RoundingType::CEIL) ?
                                  DimensionRoundingType::CEIL : DimensionRoundingType::FLOOR;

    if (srcDimsSize == 5) {
        if (dstDescsSize > 1) {
            DEBUG_LOG("NEPooling3dLayer does not support indices");
            return false;
        } else {
            unsigned int kernel_d = poolingAttrs.kernel[2];
            unsigned int stride_z = poolingAttrs.stride[2];
            unsigned int pad_front = poolingAttrs.data_pad_begin[2];
            unsigned int pad_back = poolingAttrs.data_pad_end[2];
            pool3d_info->pool_type = pool_type;
            pool3d_info->exclude_padding = exclude_padding;
            pool3d_info->pool_size = arm_compute::Size3D(kernel_w, kernel_h, kernel_d);
            pool3d_info->stride = arm_compute::Size3D(stride_x, stride_y, stride_z);
            pool3d_info->padding = arm_compute::Padding3D(pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back);
            pool3d_info->round_type = round;
            arm_compute::Status s = arm_compute::NEPooling3dLayer::validate(&srcTensorInfo, &dstTensorInfo, *pool3d_info);
            if (!s) {
                DEBUG_LOG("NEPooling3dLayer validation failed: ", s.error_description());
                return false;
            }
        }
    } else {
        pool_info->data_layout = dataLayout;
        pool_info->pool_size = arm_compute::Size2D(kernel_w, kernel_h);
        pool_info->pad_stride_info = arm_compute::PadStrideInfo(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, round);
        pool_info->pool_type = pool_type;
        pool_info->exclude_padding = exclude_padding;
        if (dstDescsSize > 1) {
            TensorInfo indTensorInfo = TensorInfo(shapeCast(*indDims), 1, arm_compute::DataType::U32, dataLayout);
            arm_compute::Status s = arm_compute::NEPoolingLayer::validate(&srcTensorInfo, &dstTensorInfo, *pool_info, &indTensorInfo);
            if (!s) {
                DEBUG_LOG("NEPoolingLayer validation with indices failed: ", s.error_description());
                return false;
            }
        } else {
            arm_compute::Status s = arm_compute::NEPoolingLayer::validate(&srcTensorInfo, &dstTensorInfo, *pool_info);
            if (!s) {
                DEBUG_LOG("NEPoolingLayer validation without indices failed: ", s.error_description());
                return false;
            }
        }
    }
    return true;
}
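[Editor's note] The scalar extraction at the top of isSupported() maps OpenVINO's {H, W}-ordered attribute vectors (single-element for 1-D pooling) onto ACL's separate width/height scalars, with height defaulting to 1 in the 1-D case. A tiny self-contained sketch of the same mapping (the 3x2 kernel is illustrative):

#include <cstdio>
#include <vector>

int main() {
    std::vector<unsigned int> kernel = {3, 2};   // {H, W}, as OpenVINO stores it
    unsigned int kernel_w = (kernel.size() >= 2) ? kernel[1] : kernel[0];
    unsigned int kernel_h = (kernel.size() >= 2) ? kernel[0] : 1;
    printf("w=%u h=%u\n", kernel_w, kernel_h);   // prints: w=2 h=3
    return 0;
}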
bool AclPoolingExecutor::init(const PoolingAttrs& poolingAttrs,
                              const std::vector<MemoryDescPtr>& srcDescs,
                              const std::vector<MemoryDescPtr>& dstDescs,
                              const dnnl::primitive_attr &attr) {
    auto srcDims = srcDescs[0]->getShape().getStaticDims();
    auto dstDims = dstDescs[0]->getShape().getStaticDims();

    TensorInfo srcTensorInfo = TensorInfo(shapeCast(srcDims), 1,
                                          precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0]));
    TensorInfo dstTensorInfo = TensorInfo(shapeCast(dstDims), 1,
                                          precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0]));

    srcTensor.allocator()->init(srcTensorInfo);
    dstTensor.allocator()->init(dstTensorInfo);

    if (srcDims.size() == 5) {
        if (dstDescs.size() == 1) {
            Pooling3dLayerInfo pool_info;
            if (!isSupported(srcTensorInfo,
                             dstTensorInfo,
                             poolingAttrs,
                             srcDims.size(),
                             dstDescs.size(),
                             getAclDataLayoutByMemoryDesc(srcDescs[0]),
                             nullptr,
                             nullptr,
                             &pool_info))
                return false;
            exec_func = [this, pool_info]{
                auto acl_op = std::make_unique<arm_compute::NEPooling3dLayer>();
                acl_op->configure(&srcTensor, &dstTensor, pool_info);
                acl_op->run();
            };
        }
    } else {
        arm_compute::PoolingLayerInfo pool_info;
        if (dstDescs.size() > 1) {
            if (!isSupported(srcTensorInfo,
                             dstTensorInfo,
                             poolingAttrs,
                             srcDims.size(),
                             dstDescs.size(),
                             getAclDataLayoutByMemoryDesc(srcDescs[0]),
                             &dstDescs[1]->getShape().getStaticDims(),
                             &pool_info,
                             nullptr))
                return false;
            auto indDims = dstDescs[1]->getShape().getStaticDims();
            TensorInfo indTensorInfo = TensorInfo(shapeCast(indDims), 1, precisionToAclDataType(dstDescs[1]->getPrecision()),
                                                  getAclDataLayoutByMemoryDesc(dstDescs[1]));
            indTensor.allocator()->init(indTensorInfo);
            exec_func = [this, pool_info]{
                auto acl_op = std::make_unique<arm_compute::NEPoolingLayer>();
                acl_op->configure(&srcTensor, &dstTensor, pool_info, &indTensor);
                acl_op->run();
            };
        } else {
            if (!isSupported(srcTensorInfo,
                             dstTensorInfo,
                             poolingAttrs,
                             srcDims.size(),
                             dstDescs.size(),
                             getAclDataLayoutByMemoryDesc(srcDescs[0]),
                             nullptr,
                             &pool_info,
                             nullptr))
                return false;
            exec_func = [this, pool_info]{
                auto acl_op = std::make_unique<arm_compute::NEPoolingLayer>();
                acl_op->configure(&srcTensor, &dstTensor, pool_info);
                acl_op->run();
            };
        }
    }
    return true;
}
void AclPoolingExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, std::unordered_map<int, MemoryPtr> postOpsArgs) {
    srcTensor.allocator()->import_memory(src[0]->GetPtr());
    dstTensor.allocator()->import_memory(dst[0]->GetPtr());
    if (dst.size() > 1) indTensor.allocator()->import_memory(dst[1]->GetPtr());

    exec_func();

    srcTensor.allocator()->free();
    dstTensor.allocator()->free();
    if (dst.size() > 1) indTensor.allocator()->free();
}

} // namespace intel_cpu
} // namespace ov
131
src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.hpp
Normal file
@ -0,0 +1,131 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "nodes/executors/pooling.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

class AclPoolingExecutor : public PoolingExecutor {
public:
    AclPoolingExecutor(const ExecutorContext::CPtr context);

    bool init(const PoolingAttrs& poolingAttrs,
              const std::vector<MemoryDescPtr>& srcDescs,
              const std::vector<MemoryDescPtr>& dstDescs,
              const dnnl::primitive_attr &attr) override;
    void exec(const std::vector<MemoryCPtr>& src,
              const std::vector<MemoryPtr>& dst,
              std::unordered_map<int, MemoryPtr> postOpsArgs) override;

    static bool isSupported(const arm_compute::TensorInfo& srcTensorInfo,
                            const arm_compute::TensorInfo& dstTensorInfo,
                            const PoolingAttrs& poolingAttrs,
                            size_t srcDimsSize,
                            size_t dstDescsSize,
                            arm_compute::DataLayout dataLayout,
                            const VectorDims* indDims,
                            arm_compute::PoolingLayerInfo* pool_info,
                            arm_compute::Pooling3dLayerInfo* pool3d_info);

    impl_desc_type getImplType() const override {
        return implType;
    }

private:
    std::function<void()> exec_func;
    PoolingAttrs poolingAttrs;
    impl_desc_type implType = impl_desc_type::acl;

    arm_compute::Tensor srcTensor;
    arm_compute::Tensor dstTensor;
    arm_compute::Tensor indTensor;
    std::unique_ptr<arm_compute::NEPoolingLayer> pooling = nullptr;
};
class AclPoolingExecutorBuilder : public PoolingExecutorBuilder {
public:
    bool isSupported(const PoolingAttrs& poolingAttrs,
                     const std::vector<MemoryDescPtr>& srcDescs,
                     const std::vector<MemoryDescPtr>& dstDescs) const override {
        if ((srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP32 &&
             dstDescs[0]->getPrecision() != InferenceEngine::Precision::FP32) &&
            (srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP16 &&
             dstDescs[0]->getPrecision() != InferenceEngine::Precision::FP16)) {
            DEBUG_LOG("AclPoolingExecutor does not support precisions:",
                      " src[0]=", srcDescs[0]->getPrecision(),
                      " dst[0]=", dstDescs[0]->getPrecision());
            return false;
        }

        if (srcDescs.size() == 2 &&
            (srcDescs[1]->getPrecision() != InferenceEngine::Precision::FP32 &&
             srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP32 &&
             dstDescs[0]->getPrecision() != InferenceEngine::Precision::FP32) &&
            (srcDescs[1]->getPrecision() != InferenceEngine::Precision::FP16 &&
             srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP16 &&
             dstDescs[0]->getPrecision() != InferenceEngine::Precision::FP16)) {
            DEBUG_LOG("AclPoolingExecutor does not support precisions:",
                      " src[0]=", srcDescs[0]->getPrecision(),
                      " src[1]=", srcDescs[1]->getPrecision(),
                      " dst[0]=", dstDescs[0]->getPrecision());
            return false;
        }

        if (dstDescs.size() == 2 &&
            dstDescs[1]->getPrecision() != InferenceEngine::Precision::U32) {
            DEBUG_LOG("AclPoolingExecutor does not support precisions:",
                      " dst[1]=", dstDescs[1]->getPrecision());
            return false;
        }

        if (srcDescs[0]->getShape().getRank() < 5) {
            if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
                  dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
                !(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
                  dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
                DEBUG_LOG("NEPoolingLayer does not support layouts:",
                          " src=", srcDescs[0]->serializeFormat(),
                          " dst=", dstDescs[0]->serializeFormat());
                return false;
            }
            if (srcDescs.size() == 2 &&
                !(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
                  srcDescs[1]->hasLayoutType(LayoutType::ncsp) &&
                  dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
                !(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
                  srcDescs[1]->hasLayoutType(LayoutType::nspc) &&
                  dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
                DEBUG_LOG("NEPoolingLayer does not support layouts:",
                          " src[0]=", srcDescs[0]->serializeFormat(),
                          " src[1]=", srcDescs[1]->serializeFormat(),
                          " dst=", dstDescs[0]->serializeFormat());
                return false;
            }
        } else {
            // The original condition repeated the same nspc/nspc check twice;
            // collapsed here to a single, behavior-identical test.
            if (!(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
                  dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
                DEBUG_LOG("Pooling3dLayer does not support layouts:",
                          " src=", srcDescs[0]->serializeFormat(),
                          " dst=", dstDescs[0]->serializeFormat());
                return false;
            }
        }

        return true;
    }

    PoolingExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
        return std::make_shared<AclPoolingExecutor>(context);
    }
};

} // namespace intel_cpu
} // namespace ov
109
src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp
Normal file
@ -0,0 +1,109 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_utils.hpp"
#include "acl_reduce.hpp"

namespace ov {
namespace intel_cpu {

using namespace arm_compute;

static arm_compute::ReductionOperation getAclReductionOperationByAlgorithm(Algorithm algorithm) {
    switch (algorithm) {
        case Algorithm::ReduceMax:  return arm_compute::ReductionOperation::MAX;
        case Algorithm::ReduceMin:  return arm_compute::ReductionOperation::MIN;
        case Algorithm::ReduceSum:  return arm_compute::ReductionOperation::SUM;
        case Algorithm::ReduceProd: return arm_compute::ReductionOperation::PROD;
        default: IE_THROW() << "Unsupported reduction operation: " << static_cast<int>(algorithm);
    }
}

AclReduceExecutor::AclReduceExecutor(const ExecutorContext::CPtr context) : ReduceExecutor(context) {}

bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs,
                             const std::vector<MemoryDescPtr>& srcDescs,
                             const std::vector<MemoryDescPtr>& dstDescs,
                             const dnnl::primitive_attr &attr) {
    if (reduceAttrs.operation != Algorithm::ReduceMax &&
        reduceAttrs.operation != Algorithm::ReduceMin &&
        reduceAttrs.operation != Algorithm::ReduceSum &&
        reduceAttrs.operation != Algorithm::ReduceProd &&
        reduceAttrs.operation != Algorithm::ReduceMean) {
        DEBUG_LOG("Unknown reduce algorithm passed into AclReduceExecutor: ", static_cast<int>(reduceAttrs.operation));
        return false;
    }

    this->reduceAttrs = reduceAttrs;

    auto srcDims = srcDescs[0]->getShape().getStaticDims();
    auto dstDims = dstDescs[0]->getShape().getStaticDims();

    TensorInfo srcTensorInfo = TensorInfo(shapeCast(srcDims), 1,
        precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0]));
    TensorInfo dstTensorInfo = TensorInfo(shapeCast(dstDims), 1,
        precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0]));

    srcTensor.allocator()->init(srcTensorInfo);
    dstTensor.allocator()->init(dstTensorInfo);

    switch (reduceAttrs.operation) {
        case Algorithm::ReduceMean: {
            for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) {
                auto axis = axisCast(reduceAttrs.axes[i], srcDims.size());
                auto pos = axisCast(i, reduceAttrs.axes.size());
                axesMean.set(pos, axis);
            }
            Status reduceMeanStatus = NEReduceMean::validate(&srcTensorInfo, axesMean, reduceAttrs.keepDims, &dstTensorInfo);
            if (!reduceMeanStatus) {
                DEBUG_LOG("NEReduceMean validation failed: ", reduceMeanStatus.error_description());
                return false;
            }
            exec_func = [this]{
                auto acl_op = std::make_unique<arm_compute::NEReduceMean>();
                acl_op->configure(&srcTensor, axesMean, this->reduceAttrs.keepDims, &dstTensor);
                acl_op->run();
            };
            break;
        }
        case Algorithm::ReduceMax:
        case Algorithm::ReduceMin:
        case Algorithm::ReduceSum:
        case Algorithm::ReduceProd: {
            if (reduceAttrs.axes.size() != 1) {
                return false;
            }
            Status reductionOperationStatus = NEReductionOperation::validate(&srcTensorInfo, &dstTensorInfo, axisCast(reduceAttrs.axes[0], srcDims.size()),
                getAclReductionOperationByAlgorithm(reduceAttrs.operation), reduceAttrs.keepDims);
            if (!reductionOperationStatus) {
                DEBUG_LOG("NEReductionOperation validation with indices failed: ", reductionOperationStatus.error_description());
                return false;
            }
            exec_func = [this, srcDims]{
                auto acl_op = std::make_unique<arm_compute::NEReductionOperation>();
                acl_op->configure(&srcTensor, &dstTensor, axisCast(this->reduceAttrs.axes[0], srcDims.size()),
                    getAclReductionOperationByAlgorithm(this->reduceAttrs.operation), this->reduceAttrs.keepDims);
                acl_op->run();
            };
            break;
        }
        default:
            IE_THROW() << "Unsupported operation type for ACL Reduce executor: " << static_cast<int>(reduceAttrs.operation);
    }

    return true;
}

void AclReduceExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) {
    srcTensor.allocator()->import_memory(src[0]->GetPtr());
    dstTensor.allocator()->import_memory(dst[0]->GetPtr());

    exec_func();

    srcTensor.allocator()->free();
    dstTensor.allocator()->free();
}

} // namespace intel_cpu
} // namespace ov
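Since ACL stores dimensions in reverse order (see shapeCast/axisCast in acl_utils.hpp below), the ReduceMean branch has to remap both the axis values and their positions. A self-contained sketch of that remapping for an NCHW reduction over H and W; plain size_t values stand in for the plugin's types, so this is an illustration rather than plugin code:

#include <cstddef>
#include <iostream>
#include <vector>

// Local stand-in for axisCast() from acl_utils.hpp.
std::size_t axisCast(std::size_t axis, std::size_t shapeSize) {
    return shapeSize - axis - 1;
}

int main() {
    const std::size_t rank = 4;                    // NCHW
    const std::vector<std::size_t> axes = {2, 3};  // reduce over H and W
    for (std::size_t i = 0; i < axes.size(); ++i) {
        std::cout << "axes[" << i << "]=" << axes[i]
                  << " -> acl axis " << axisCast(axes[i], rank)
                  << " stored at coordinate " << axisCast(i, axes.size()) << '\n';
    }
    // axes[0]=2 -> acl axis 1 stored at coordinate 1
    // axes[1]=3 -> acl axis 0 stored at coordinate 0
}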
75
src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.hpp
Normal file
@ -0,0 +1,75 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

// TODO: remove relative path
#include "../reduce.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

class AclReduceExecutor : public ReduceExecutor {
public:
    AclReduceExecutor(const ExecutorContext::CPtr context);

    bool init(const ReduceAttrs& reduceAttrs,
              const std::vector<MemoryDescPtr>& srcDescs,
              const std::vector<MemoryDescPtr>& dstDescs,
              const dnnl::primitive_attr &attr) override;
    void exec(const std::vector<MemoryCPtr>& src,
              const std::vector<MemoryPtr>& dst,
              const void *post_ops_data_) override;

    impl_desc_type getImplType() const override {
        return implType;
    }

private:
    std::function<void()> exec_func;
    ReduceAttrs reduceAttrs;
    impl_desc_type implType = impl_desc_type::acl;

    arm_compute::Coordinates axesMean;
    arm_compute::Tensor srcTensor;
    arm_compute::Tensor dstTensor;
};

class AclReduceExecutorBuilder : public ReduceExecutorBuilder {
public:
    bool isSupported(const ReduceAttrs& reduceAttrs,
                     const std::vector<MemoryDescPtr>& srcDescs,
                     const std::vector<MemoryDescPtr>& dstDescs) const override {
        if (reduceAttrs.operation == Algorithm::ReduceMean) {
            if (srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision() ||
                (srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP32 &&
                 srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP16)) {
                DEBUG_LOG("NEReduceMean does not support precisions:",
                          " src[0]=", srcDescs[0]->getPrecision(),
                          " dst[0]=", dstDescs[0]->getPrecision());
                return false;
            }
        } else {
            if (srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision() ||
                (srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP32 &&
                 srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP16 &&
                 srcDescs[0]->getPrecision() != InferenceEngine::Precision::I32)) {
                DEBUG_LOG("NEReductionOperation does not support precisions:",
                          " src[0]=", srcDescs[0]->getPrecision(),
                          " dst[0]=", dstDescs[0]->getPrecision());
                return false;
            }
        }
        return true;
    }

    ReduceExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
        return std::make_shared<AclReduceExecutor>(context);
    }
};

} // namespace intel_cpu
} // namespace ov
81
src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp
Normal file
@ -0,0 +1,81 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once

#include "ie_precision.hpp"
#include "memory_desc/cpu_memory_desc.h"
#include "arm_compute/core/Types.h"

namespace ov {
namespace intel_cpu {

/**
 * @brief Return ComputeLibrary TensorShape with the reversed layout schema used in ACL
 * @param dims vector of dimensions to convert
 * @return ComputeLibrary TensorShape object
 */
inline arm_compute::TensorShape shapeCast(const VectorDims& dims) {
    arm_compute::TensorShape tensorShape;
    for (std::size_t i = 0; i < dims.size(); ++i) {
        tensorShape.set(dims.size() - i - 1, dims[i], false);
    }
    if (tensorShape.num_dimensions() == 0) {
        tensorShape.set(0, 1, false);
        tensorShape.set_num_dimensions(1);
    }
    return tensorShape;
}

inline std::size_t axisCast(const std::size_t axis, const std::size_t shapeSize) {
    return shapeSize - axis - 1;
}
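A quick self-contained illustration of the reversal these helpers perform; a plain std::vector stands in for arm_compute::TensorShape here, so this is a sketch of the semantics, not plugin code:

#include <cstddef>
#include <iostream>
#include <vector>

// Local re-implementation of the reversal performed by shapeCast(),
// including the promotion of an empty (scalar) shape to a 1-element 1-D shape.
std::vector<std::size_t> shapeCastSketch(const std::vector<std::size_t>& dims) {
    std::vector<std::size_t> aclShape(dims.empty() ? 1 : dims.size(), 1);
    for (std::size_t i = 0; i < dims.size(); ++i)
        aclShape[dims.size() - i - 1] = dims[i];
    return aclShape;
}

int main() {
    // An OpenVINO NCHW shape {1, 3, 224, 224} becomes the ACL shape (224, 224, 3, 1).
    for (std::size_t d : shapeCastSketch({1, 3, 224, 224}))
        std::cout << d << ' ';   // prints: 224 224 3 1
    std::cout << '\n';
}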

inline Dim vectorProduct(const VectorDims& vec, size_t size) {
    Dim prod = 1;
    for (size_t i = 0; i < size; ++i)
        prod *= vec[i];
    return prod;
}

/**
 * @brief Return ComputeLibrary DataType that corresponds to the given precision
 * @param precision precision to be converted
 * @return ComputeLibrary DataType or UNKNOWN if precision is not mapped to DataType
 */
inline arm_compute::DataType precisionToAclDataType(InferenceEngine::Precision precision) {
    switch (precision) {
        case InferenceEngine::Precision::I8:   return arm_compute::DataType::S8;
        case InferenceEngine::Precision::U8:   return arm_compute::DataType::U8;
        case InferenceEngine::Precision::I16:  return arm_compute::DataType::S16;
        case InferenceEngine::Precision::U16:  return arm_compute::DataType::U16;
        case InferenceEngine::Precision::I32:  return arm_compute::DataType::S32;
        case InferenceEngine::Precision::U32:  return arm_compute::DataType::U32;
        case InferenceEngine::Precision::FP16: return arm_compute::DataType::F16;
        case InferenceEngine::Precision::FP32: return arm_compute::DataType::F32;
        case InferenceEngine::Precision::FP64: return arm_compute::DataType::F64;
        case InferenceEngine::Precision::I64:  return arm_compute::DataType::S64;
        case InferenceEngine::Precision::BF16: return arm_compute::DataType::BFLOAT16;
        default:                               return arm_compute::DataType::UNKNOWN;
    }
}

/**
 * @brief Return ComputeLibrary DataLayout that corresponds to MemoryDesc layout
 * @param desc MemoryDesc from which layout is retrieved
 * @return ComputeLibrary DataLayout or UNKNOWN if MemoryDesc layout is not mapped to DataLayout
 */
inline arm_compute::DataLayout getAclDataLayoutByMemoryDesc(MemoryDescCPtr desc) {
    if (desc->hasLayoutType(LayoutType::ncsp)) {
        if (desc->getShape().getRank() <= 4) return arm_compute::DataLayout::NCHW;
        if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NCDHW;
    } else if (desc->hasLayoutType(LayoutType::nspc)) {
        if (desc->getShape().getRank() <= 4) return arm_compute::DataLayout::NHWC;
        if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NDHWC;
    }
    return arm_compute::DataLayout::UNKNOWN;
}

} // namespace intel_cpu
} // namespace ov
15
src/plugins/intel_cpu/src/nodes/executors/eltwise.cpp
Normal file
@ -0,0 +1,15 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "eltwise.hpp"

namespace ov {
namespace intel_cpu {

using namespace InferenceEngine;

EltwiseExecutor::EltwiseExecutor(const ExecutorContext::CPtr context) : context(context) {}

} // namespace intel_cpu
} // namespace ov
109
src/plugins/intel_cpu/src/nodes/executors/eltwise.hpp
Normal file
@ -0,0 +1,109 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <cassert>

#include "cpu_memory.h"
#include "onednn/iml_type_mapper.h"
#include "executor.hpp"

namespace ov {
namespace intel_cpu {

struct EltwiseAttrs {
    Algorithm algorithm;
    float alpha;
    float beta;
    float gamma;

    EltwiseAttrs() : algorithm(Algorithm::Default), alpha(0), beta(0), gamma(0) {}
    EltwiseAttrs(Algorithm algorithm, float alpha, float beta, float gamma) : algorithm(algorithm), alpha(alpha), beta(beta), gamma(gamma) {}

    bool operator==(const EltwiseAttrs& rhs) const {
        return algorithm == rhs.algorithm &&
               alpha == rhs.alpha &&
               beta == rhs.beta &&
               gamma == rhs.gamma;
    }
};

enum class EltwisePostOpType {
    Undefined,
    Eltwise,
    Dnnl
};

class EltwisePostOp {
public:
    EltwisePostOp(EltwiseAttrs eltwise) {
        type = EltwisePostOpType::Eltwise;
        this->eltwise = eltwise;
    }

    EltwisePostOp(dnnl::post_ops dnnlPostOps) {
        type = EltwisePostOpType::Dnnl;
        this->dnnlPostOps = dnnlPostOps;
    }

    ~EltwisePostOp() = default;

    EltwiseAttrs eltwise;
    dnnl::post_ops dnnlPostOps;

    EltwisePostOpType type = EltwisePostOpType::Undefined;

    bool operator==(const EltwisePostOp &rhs) const {
        if (type != rhs.type) { return false; }
        bool ret = true;
        switch (type) {
        case EltwisePostOpType::Eltwise:
            ret = eltwise == rhs.eltwise;
            break;
        case EltwisePostOpType::Dnnl:
            ret = dnnlPostOps == rhs.dnnlPostOps;
            break;
        default: assert(!"unsupported eltwise post operation type");
        }
        return ret;
    }
};

class EltwiseExecutor {
public:
    EltwiseExecutor(const ExecutorContext::CPtr context);
    virtual bool init(const EltwiseAttrs& eltwiseAttrs,
                      const std::vector<MemoryDescPtr>& srcDescs,
                      const std::vector<MemoryDescPtr>& dstDescs,
                      const std::vector<EltwisePostOp>& postOps) = 0;

    virtual void exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) = 0;
    virtual ~EltwiseExecutor() = default;

    virtual impl_desc_type getImplType() const = 0;

protected:
    EltwiseAttrs eltwiseAttrs;
    const ExecutorContext::CPtr context;
};

using EltwiseExecutorPtr = std::shared_ptr<EltwiseExecutor>;
using EltwiseExecutorCPtr = std::shared_ptr<const EltwiseExecutor>;

class EltwiseExecutorBuilder {
public:
    virtual ~EltwiseExecutorBuilder() = default;
    virtual bool isSupported(const EltwiseAttrs& eltwiseAttrs,
                             const std::vector<MemoryDescPtr>& srcDescs,
                             const std::vector<MemoryDescPtr>& dstDescs) const = 0;
    virtual EltwiseExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0;
};

using EltwiseExecutorBuilderPtr = std::shared_ptr<EltwiseExecutorBuilder>;
using EltwiseExecutorBuilderCPtr = std::shared_ptr<const EltwiseExecutorBuilder>;

} // namespace intel_cpu
} // namespace ov
19
src/plugins/intel_cpu/src/nodes/executors/eltwise_list.cpp
Normal file
@ -0,0 +1,19 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "eltwise_list.hpp"

namespace ov {
namespace intel_cpu {

const std::vector<EltwiseExecutorDesc>& getEltwiseExecutorsList() {
    static std::vector<EltwiseExecutorDesc> descs = {
        OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared<AclEltwiseExecutorBuilder>())
    };

    return descs;
}

} // namespace intel_cpu
} // namespace ov
84
src/plugins/intel_cpu/src/nodes/executors/eltwise_list.hpp
Normal file
@ -0,0 +1,84 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "executor.hpp"

#include "eltwise.hpp"
#if defined(OV_CPU_WITH_ACL)
#include "acl/acl_eltwise.hpp"
#endif

#include "onednn/iml_type_mapper.h"
#include "common/primitive_cache.hpp"

namespace ov {
namespace intel_cpu {

struct EltwiseExecutorDesc {
    ExecutorType executorType;
    EltwiseExecutorBuilderCPtr builder;
};

const std::vector<EltwiseExecutorDesc>& getEltwiseExecutorsList();

class EltwiseExecutorFactory : public ExecutorFactory {
public:
    EltwiseExecutorFactory(const EltwiseAttrs& eltwiseAttrs,
                           const std::vector<MemoryDescPtr>& srcDescs,
                           const std::vector<MemoryDescPtr>& dstDescs,
                           const ExecutorContext::CPtr context) : ExecutorFactory(context) {
        for (auto& desc : getEltwiseExecutorsList()) {
            if (desc.builder->isSupported(eltwiseAttrs, srcDescs, dstDescs)) {
                supportedDescs.push_back(desc);
            }
        }
    }

    ~EltwiseExecutorFactory() = default;
    virtual EltwiseExecutorPtr makeExecutor(const EltwiseAttrs& eltwiseAttrs,
                                            const std::vector<MemoryDescPtr>& srcDescs,
                                            const std::vector<MemoryDescPtr>& dstDescs,
                                            const std::vector<EltwisePostOp>& postOps) {
        auto build = [&](const EltwiseExecutorDesc* desc) {
            auto executor = desc->builder->makeExecutor(context);
            if (executor->init(eltwiseAttrs, srcDescs, dstDescs, postOps)) {
                return executor;
            }

            EltwiseExecutorPtr ptr = nullptr;
            return ptr;
        };

        if (chosenDesc) {
            if (auto executor = build(chosenDesc)) {
                return executor;
            }
        }

        for (const auto& sd : supportedDescs) {
            if (auto executor = build(&sd)) {
                chosenDesc = &sd;
                return executor;
            }
        }

        IE_THROW() << "Supported Eltwise executor is not found";
    }

    bool isEmpty() {
        return supportedDescs.empty();
    }

private:
    std::vector<EltwiseExecutorDesc> supportedDescs;
    const EltwiseExecutorDesc* chosenDesc = nullptr;
};
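The factory remembers the descriptor of the last builder whose init() succeeded (chosenDesc) and retries it first on subsequent calls, falling back to a full scan of supportedDescs only when that fails, for example after a shape change invalidates the cached choice. A toy, self-contained sketch of that fallback pattern; every name here is local to the example, nothing is plugin API:

#include <functional>
#include <iostream>
#include <vector>

using Builder = std::function<bool()>;  // returns true when init succeeds

int main() {
    std::vector<Builder> supported = {
        []{ return false; },   // e.g. validation fails for this shape
        []{ return true; }     // this one works
    };
    const Builder* chosen = nullptr;
    auto make = [&]() -> const Builder* {
        if (chosen && (*chosen)()) return chosen;       // fast path: retry last winner
        for (const auto& b : supported)
            if (b()) { chosen = &b; return chosen; }    // scan and cache
        return nullptr;                                 // caller would throw here
    };
    std::cout << (make() != nullptr) << '\n';  // prints: 1
}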

using EltwiseExecutorFactoryPtr = std::shared_ptr<EltwiseExecutorFactory>;
using EltwiseExecutorFactoryCPtr = std::shared_ptr<const EltwiseExecutorFactory>;

} // namespace intel_cpu
} // namespace ov
95
src/plugins/intel_cpu/src/nodes/executors/executor.hpp
Normal file
@ -0,0 +1,95 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "cache/multi_cache.h"
#include "graph_context.h"
#include "onednn/iml_type_mapper.h"

namespace ov {
namespace intel_cpu {

#if defined(OV_CPU_WITH_ACL)
#define OV_CPU_INSTANCE_ACL(...) \
    {__VA_ARGS__},
#else
#define OV_CPU_INSTANCE_ACL(...)
#endif

#if defined(OV_CPU_WITH_DNNL)
#define OV_CPU_INSTANCE_DNNL(...) \
    {__VA_ARGS__},
#else
#define OV_CPU_INSTANCE_DNNL(...)
#endif

#if defined(OPENVINO_ARCH_X86_64)
#define OV_CPU_INSTANCE_X64(...) \
    {__VA_ARGS__},
#else
#define OV_CPU_INSTANCE_X64(...)
#endif

#define OV_CPU_INSTANCE_COMMON(...) \
    {__VA_ARGS__},
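These instance macros exist so each executor list can enumerate backend builders in a single initializer without #ifdef blocks at the call site: when a backend is compiled out, its entry expands to nothing and simply disappears from the list. A toy, self-contained reproduction of the pattern; the names here are illustrative, not plugin code:

#include <iostream>
#include <vector>

#define WITH_FOO 1
#if WITH_FOO
#define INSTANCE_FOO(...) {__VA_ARGS__},
#else
#define INSTANCE_FOO(...)
#endif

struct Desc { int type; const char* name; };

int main() {
    std::vector<Desc> descs = {
        INSTANCE_FOO(1, "foo")   // the entry vanishes entirely when WITH_FOO is 0
    };
    std::cout << descs.size() << '\n';  // 1 here; 0 if WITH_FOO were 0
}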

enum class ExecutorType {
    Undefined,
    Common,
    x64,
    Dnnl,
    Acl
};

class ExecutorContext {
public:
    typedef std::shared_ptr<ExecutorContext> Ptr;
    typedef std::shared_ptr<const ExecutorContext> CPtr;

    ExecutorContext(const GraphContext::CPtr graphContext, const std::vector<impl_desc_type>& implPriorities) {
        this->runtimeCache = graphContext->getParamsCache();
        this->scratchPad = graphContext->getScratchPad();
        this->engine = graphContext->getEngine();
        this->implPriorities = implPriorities;
    }

    MultiCacheWeakPtr getRuntimeCache() const {
        return runtimeCache;
    }

    DnnlScratchPadPtr getScratchPad() const {
        return scratchPad;
    }

    dnnl::engine getEngine() const {
        return engine;
    }

    const std::vector<impl_desc_type>& getImplPriorities() const {
        return implPriorities;
    }

private:
    // weak_ptr is required to avoid cycle dependencies with MultiCache
    // since ExecutorContext is stored in Executor itself
    MultiCacheWeakPtr runtimeCache;
    DnnlScratchPadPtr scratchPad = nullptr;
    dnnl::engine engine;
    std::vector<impl_desc_type> implPriorities = {};
};

class ExecutorFactory {
public:
    ExecutorFactory(const ExecutorContext::CPtr context) : context(context) {}
    virtual ~ExecutorFactory() = default;

    const ExecutorContext::CPtr context;
};

using ExecutorFactoryPtr = std::shared_ptr<ExecutorFactory>;
using ExecutorFactoryCPtr = std::shared_ptr<const ExecutorFactory>;

} // namespace intel_cpu
} // namespace ov
528
src/plugins/intel_cpu/src/nodes/executors/interpolate.cpp
Normal file
@ -0,0 +1,528 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <cmath>

#include "interpolate.hpp"
#include "ie_parallel.hpp"
#include "nodes/common/cpu_memcpy.h"
#include "emitters/x64/jit_load_store_emitters.hpp"

bool ov::intel_cpu::InterpolateExecutor::init(const InterpolateAttrs& interpolateAttrs,
                                              const std::vector<MemoryDescPtr>& srcDescs,
                                              const std::vector<MemoryDescPtr>& dstDescs,
                                              const dnnl::primitive_attr &attr) {
    const auto &srcDims = srcDescs[0]->getShape().getStaticDims();
    const auto &dstDims = dstDescs[0]->getShape().getStaticDims();
    interpAttrs = interpolateAttrs;
    srcDimPad5d = to5Dim(getPaddedInputShape(srcDims, interpolateAttrs.padBegin, interpolateAttrs.padEnd));
    dstDim5d = to5Dim(dstDims);
    srcDataSize = interpolateAttrs.inPrc.size();
    dstDataSize = interpolateAttrs.outPrc.size();
    dataRank = srcDims.size();
    spatialDimSize = getSpatialDimsNum(dataRank);

    switch (interpAttrs.mode) {
        case InterpolateMode::nearest: {
            buildTblNN(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.layout, interpolateAttrs.nearestMode);
            break;
        }
        case InterpolateMode::linear_onnx: {
            buildTblLinearOnnx(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.layout);
            break;
        }
        case InterpolateMode::linear: {
            static constexpr int LINEAR_KERNEL = 2;
            buildTblLinear(srcDimPad5d, dstDim5d, interpAttrs.dataScales, LINEAR_KERNEL, interpolateAttrs.antialias);
            break;
        }
        case InterpolateMode::cubic: {
            buildTblCubic(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.cubeCoeff, interpolateAttrs.layout);
            break;
        }
        default: {
            IE_THROW() << "Interpolate executor does not support interpolate mode: " << interpAttrs.mode;
        }
    }
    return true;
}

// =====================================================================================================================
// index layout:
// d_0............d_OD-1, h_0..............h_OH-1, w_0................w_OW-1
void ov::intel_cpu::InterpolateExecutor::buildTblNN(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
        const std::vector<float>& dataScales, InterpolateLayoutType layout, InterpolateNearestMode nearestMode) {
    const int dimSize = dataRank;
    float fz = (dimSize == 5) ? dataScales[dimSize - 3] : 1.f;
    float fy = dataScales[dimSize - 2];
    float fx = dataScales[dimSize - 1];
    size_t ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4];
    size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4];

    indexTable.resize(OD + OH + OW);
    bool isDDownsample = fz < 1;
    bool isHDownsample = fy < 1;
    bool isWDownsample = fx < 1;
    for (int oz = 0; oz < OD; oz++) {
        float iz = coordTransToInput(oz, fz, ID, OD);
        indexTable[oz] = nearestRound(iz, isDDownsample, nearestMode);
        indexTable[oz] = clipCoord(indexTable[oz], ID);
    }
    for (int oy = 0; oy < OH; oy++) {
        float iy = coordTransToInput(oy, fy, IH, OH);
        indexTable[OD + oy] = nearestRound(iy, isHDownsample, nearestMode);
        indexTable[OD + oy] = clipCoord(indexTable[OD + oy], IH);
    }
    for (int ox = 0; ox < OW; ox++) {
        float ix = coordTransToInput(ox, fx, IW, OW);
        indexTable[OD + OH + ox] = nearestRound(ix, isWDownsample, nearestMode);
        indexTable[OD + OH + ox] = clipCoord(indexTable[OD + OH + ox], IW);
    }
}

// scale is float(outShape) / float(inShape)
// strictly consistent with the ONNX calculation manner (divide by scale, not multiply by the inverse), since this is done offline;
// a slight precision difference can produce an obviously wrong value because of the "nearest round" behavior of NN mode
float ov::intel_cpu::InterpolateExecutor::coordTransToInput(int outCoord, float scale, int inShape, int outShape) const {
    if (scale == 1.0f || (inShape == outShape)) {
        return outCoord;
    }
    switch (interpAttrs.coordTransMode) {
        case InterpolateCoordTransMode::half_pixel:
            return (outCoord + 0.5f) / scale - 0.5f;
        case InterpolateCoordTransMode::pytorch_half_pixel:
            if (outShape > 1)
                return (outCoord + 0.5f) / scale - 0.5f;
            else
                return 0;
        case InterpolateCoordTransMode::asymmetric:
            return static_cast<float>(outCoord) / scale;
        case InterpolateCoordTransMode::tf_half_pixel_for_nn:
            return (outCoord + 0.5f) / scale;
        case InterpolateCoordTransMode::align_corners:
            if (outShape > 1)
                return outCoord * (static_cast<float>(inShape - 1) / static_cast<float>(outShape - 1));
            else
                return 0;
        default:
            IE_THROW() << "InterpolateExecutor does not support specified coordinate transformation mode";
    }
}
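A quick numeric check of the half_pixel branch, as a standalone sketch; the function below is a simplified local copy of one case, not the plugin's method. For a 1D 2x upscale (scale = outShape / inShape = 2), output pixel 3 samples input coordinate (3 + 0.5) / 2 - 0.5 = 1.25:

#include <iostream>

// Simplified copy of the half_pixel coordinate transform.
float halfPixelToInput(int outCoord, float scale) {
    return (outCoord + 0.5f) / scale - 0.5f;
}

int main() {
    // 1D upscale from 4 to 8 elements: scale = 8.0f / 4.0f = 2.0f
    for (int ox = 0; ox < 8; ++ox)
        std::cout << "out " << ox << " <- in " << halfPixelToInput(ox, 2.0f) << '\n';
    // out 0 <- in -0.25, out 1 <- in 0.25, ..., out 7 <- in 3.25
}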

int ov::intel_cpu::InterpolateExecutor::nearestRound(float originCoord, bool isDownsample, InterpolateNearestMode nearestMode) const {
    switch (nearestMode) {
        case InterpolateNearestMode::round_prefer_floor:
            if (originCoord == (static_cast<int>(originCoord) + 0.5f))
                return static_cast<int>(std::floor(originCoord));
            else
                return static_cast<int>(std::round(originCoord));
        case InterpolateNearestMode::round_prefer_ceil:
            return static_cast<int>(std::round(originCoord));
        case InterpolateNearestMode::floor:
            return static_cast<int>(std::floor(originCoord));
        case InterpolateNearestMode::ceil:
            return static_cast<int>(std::ceil(originCoord));
        case InterpolateNearestMode::simple:
            if (isDownsample)
                return static_cast<int>(std::ceil(originCoord));
            else
                return static_cast<int>(originCoord);
        default:
            IE_THROW() << "InterpolateExecutor does not support specified nearest round mode";
    }
}

void ov::intel_cpu::InterpolateExecutor::linearOnnxCF(int outCoord, float scale, int inShape, int outShape,
        int& index0, int& index1, float& weight0, float& weight1) {
    float inCoord = coordTransToInput(outCoord, scale, inShape, outShape);
    inCoord = std::max(0.0f, std::min(inCoord, static_cast<float>(inShape - 1)));
    index0 = std::min(static_cast<int>(inCoord), inShape - 1);
    index1 = std::min(index0 + 1, inShape - 1);

    weight1 = std::fabs(inCoord - index0);
    weight0 = std::fabs(inCoord - index1);
    if (index0 == index1) {
        weight0 = 0.5f;
        weight1 = 0.5f;
    }
}
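To see what linearOnnxCF yields, here is a hedged standalone sketch, simplified to assume the half_pixel transform: for a 2x upscale, output pixel 0 maps to inCoord -0.25, which is clamped to 0, giving neighbors (0, 1) with weights (1.0, 0.0):

#include <algorithm>
#include <cmath>
#include <iostream>

// Standalone copy of the neighbor/weight computation (half_pixel transform assumed).
void linearOnnxCF(int outCoord, float scale, int inShape,
                  int& i0, int& i1, float& w0, float& w1) {
    float inCoord = (outCoord + 0.5f) / scale - 0.5f;
    inCoord = std::max(0.0f, std::min(inCoord, static_cast<float>(inShape - 1)));
    i0 = std::min(static_cast<int>(inCoord), inShape - 1);
    i1 = std::min(i0 + 1, inShape - 1);
    w1 = std::fabs(inCoord - i0);   // weight of the right neighbor
    w0 = std::fabs(inCoord - i1);   // weight of the left neighbor
    if (i0 == i1) w0 = w1 = 0.5f;
}

int main() {
    int i0, i1; float w0, w1;
    for (int ox = 0; ox < 8; ++ox) {          // 4 -> 8 upscale, scale = 2
        linearOnnxCF(ox, 2.0f, 4, i0, i1, w0, w1);
        std::cout << "out " << ox << ": v[" << i0 << "]*" << w0
                  << " + v[" << i1 << "]*" << w1 << '\n';
    }
}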

void ov::intel_cpu::InterpolateExecutor::buildTblLinearOnnx(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
        const std::vector<float>& dataScales, InterpolateLayoutType layout) {
    int dimSize = dataRank;
    float fz = (spatialDimSize > 2) ? dataScales[dimSize - 3] : 1.f;
    float fy = (spatialDimSize > 1) ? dataScales[dimSize - 2] : 1.f;
    float fx = dataScales[dimSize - 1];
    int ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4];
    int OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4];

    std::vector<int*> indexPtr(MAX_INPUT_INTERPOLATE, 0);
    std::vector<float*> weightPtr(MAX_INPUT_INTERPOLATE, 0);
    if (layout == InterpolateLayoutType::planar) {
        // FrontTopLeft:0, FrontTopRight:1, FrontBottomLeft:2, FrontBottomRight:3,
        // EndTopLeft:4, EndTopRight:5, EndBottomLeft:6, EndBottomRight:7
        // weight: left:0, right:1, top:2, bottom:3, front:4, end:5
        int eltInGrid = (spatialDimSize > 2) ? MAX_INPUT_INTERPOLATE : ((spatialDimSize > 1) ? 4 : 2);
        int idxType = 2;
        int scratchLen = rnd_up(eltInGrid * OW * OH * OD, 16);
        indexTable.resize(idxType * scratchLen);

        indexPtr[0] = static_cast<int*>(&indexTable[0]);
        indexPtr[1] = static_cast<int*>(&indexTable[OW * OH * OD]);
        weightPtr[0] = reinterpret_cast<float*>(&indexTable[scratchLen]);
        weightPtr[1] = reinterpret_cast<float*>(&indexTable[scratchLen + OW * OH * OD]);
        if (spatialDimSize > 1) {
            indexPtr[2] = static_cast<int*>(&indexTable[2 * OW * OH * OD]);
            indexPtr[3] = static_cast<int*>(&indexTable[3 * OW * OH * OD]);
            weightPtr[2] = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW * OH * OD]);
            weightPtr[3] = reinterpret_cast<float*>(&indexTable[scratchLen + 3 * OW * OH * OD]);
        }
        if (spatialDimSize > 2) {
            indexPtr[4] = static_cast<int*>(&indexTable[4 * OW * OH * OD]);
            indexPtr[5] = static_cast<int*>(&indexTable[5 * OW * OH * OD]);
            indexPtr[6] = static_cast<int*>(&indexTable[6 * OW * OH * OD]);
            indexPtr[7] = static_cast<int*>(&indexTable[7 * OW * OH * OD]);
            weightPtr[4] = reinterpret_cast<float*>(&indexTable[scratchLen + 4 * OW * OH * OD]);
            weightPtr[5] = reinterpret_cast<float*>(&indexTable[scratchLen + 5 * OW * OH * OD]);
        }
        int scale = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::sse41) ? srcDataSize : 1;

        for (int oz = 0; oz < OD; oz++) {
            int izF, izE;
            float weightF, weightE;
            linearOnnxCF(oz, fz, ID, OD, izF, izE, weightF, weightE);
            int idxOz = oz * OH * OW;
            for (int oy = 0; oy < OH; oy++) {
                int iyT, iyB;
                float weightT, weightB;
                linearOnnxCF(oy, fy, IH, OH, iyT, iyB, weightT, weightB);
                int idxOzOy = idxOz + oy * OW;
                for (int ox = 0; ox < OW; ox++) {
                    int ixL, ixR;
                    float weightL, weightR;
                    linearOnnxCF(ox, fx, IW, OW, ixL, ixR, weightL, weightR);

                    int idxOzOyOx = idxOzOy + ox;
                    indexPtr[0][idxOzOyOx] = (izF * IH * IW + iyT * IW + ixL) * scale;
                    indexPtr[1][idxOzOyOx] = (izF * IH * IW + iyT * IW + ixR) * scale;
                    weightPtr[0][idxOzOyOx] = weightL;
                    weightPtr[1][idxOzOyOx] = weightR;
                    if (spatialDimSize > 1) {
                        indexPtr[2][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixL) * scale;
                        indexPtr[3][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixR) * scale;
                        weightPtr[2][idxOzOyOx] = weightT;
                        weightPtr[3][idxOzOyOx] = weightB;
                    }
                    if (spatialDimSize > 2) {
                        indexPtr[4][idxOzOyOx] = (izE * IH * IW + iyT * IW + ixL) * scale;
                        indexPtr[5][idxOzOyOx] = (izE * IH * IW + iyT * IW + ixR) * scale;
                        indexPtr[6][idxOzOyOx] = (izE * IH * IW + iyB * IW + ixL) * scale;
                        indexPtr[7][idxOzOyOx] = (izE * IH * IW + iyB * IW + ixR) * scale;
                        weightPtr[4][idxOzOyOx] = weightF;
                        weightPtr[5][idxOzOyOx] = weightE;
                    }
                }
            }
        }
    } else {
        // index: left:OW right:OW top:OH bottom:OH front:OD end:OD
        // weight: same as index
        size_t scratchLen = rnd_up(OW + OW + OH + OH + OD + OD, 16);
        int idxType = 2;
        indexTable.resize(idxType * scratchLen);
        indexPtr[0] = static_cast<int*>(&indexTable[0]);
        indexPtr[1] = static_cast<int*>(&indexTable[OW]);
        indexPtr[2] = static_cast<int*>(&indexTable[2 * OW]);
        indexPtr[3] = static_cast<int*>(&indexTable[2 * OW + OH]);
        indexPtr[4] = static_cast<int*>(&indexTable[2 * OW + 2 * OH]);
        indexPtr[5] = static_cast<int*>(&indexTable[2 * OW + 2 * OH + OD]);

        weightPtr[0] = reinterpret_cast<float*>(&indexTable[scratchLen]);
        weightPtr[1] = reinterpret_cast<float*>(&indexTable[scratchLen + OW]);
        weightPtr[2] = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW]);
        weightPtr[3] = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW + OH]);
        weightPtr[4] = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW + 2 * OH]);
        weightPtr[5] = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW + 2 * OH + OD]);

        for (int ox = 0; ox < OW; ox++) {
            linearOnnxCF(ox, fx, IW, OW, indexPtr[0][ox], indexPtr[1][ox], weightPtr[0][ox], weightPtr[1][ox]);
        }
        for (int oy = 0; oy < OH; oy++) {
            linearOnnxCF(oy, fy, IH, OH, indexPtr[2][oy], indexPtr[3][oy], weightPtr[2][oy], weightPtr[3][oy]);
        }
        for (int oz = 0; oz < OD; oz++) {
            linearOnnxCF(oz, fz, ID, OD, indexPtr[4][oz], indexPtr[5][oz], weightPtr[4][oz], weightPtr[5][oz]);
        }
    }
}

// table layout:
// wd .........wd, wh............wh, ww.............ww, id...........id, ih............ih, iw..............iw
//                 |                                                     |
//                 wh0.....wh_diameter                                   ih0.....ih_diameter
void ov::intel_cpu::InterpolateExecutor::buildTblLinear(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
        const std::vector<float>& dataScales, int kernel_width, bool antialias) {
    int dimSize = dataRank;
    float fz = (dimSize == 5) ? dataScales[dimSize - 3] : 1.f;
    float fy = dataScales[dimSize - 2];
    float fx = dataScales[dimSize - 1];
    size_t ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4];
    size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4];

    if (!(IW == OW && IH == OH && ID == OD)) {
        float ax = antialias ? fx : 1.0f;
        float ay = antialias ? fy : 1.0f;
        float az = antialias ? fz : 1.0f;

        int rx = (fx > 1.0f) ? 2 : static_cast<int>(ceil(static_cast<float>(kernel_width) / ax));
        int ry = (fy > 1.0f) ? 2 : static_cast<int>(ceil(static_cast<float>(kernel_width) / ay));
        int rz = (fz > 1.0f) ? 2 : static_cast<int>(ceil(static_cast<float>(kernel_width) / az));

        int diaOD = 2 * rz + 1;
        int diaOH = 2 * ry + 1;
        int diaOW = 2 * rx + 1;
        int sizeOD = OD * diaOD;
        int sizeOH = OH * diaOH;
        int sizeOW = OW * diaOW;
        indexTable.resize((sizeOD + sizeOH + sizeOW) * 2);
        float *weightTable = reinterpret_cast<float*>(&indexTable[0]);
        float *weightOD = static_cast<float*>(&weightTable[0]);
        float *weightOH = static_cast<float*>(&weightTable[sizeOD]);
        float *weightOW = static_cast<float*>(&weightTable[sizeOD + sizeOH]);

        int *idxTable = static_cast<int*>(&indexTable[sizeOD + sizeOH + sizeOW]);
        int *idxOD = static_cast<int*>(&idxTable[0]);
        int *idxOH = static_cast<int*>(&idxTable[sizeOD]);
        int *idxOW = static_cast<int*>(&idxTable[sizeOD + sizeOH]);

        for (int oz = 0; oz < OD; oz++) {
            float iz = coordTransToInput(oz, fz, ID, OD);
            int iz_r = static_cast<int>(std::round(iz));
            for (int r = iz_r - rz, i = 0; r <= iz_r + rz; r++, i++) {
                idxOD[oz * diaOD + i] = r;
                if (r < 0 || r >= static_cast<int>(ID)) {
                    weightOD[oz * diaOD + i] = 0.f;
                } else {
                    float dz = iz - r;
                    weightOD[oz * diaOD + i] = az * triangleCoeff(az * dz);
                }
            }
        }
        for (int oy = 0; oy < OH; oy++) {
            float iy = coordTransToInput(oy, fy, IH, OH);
            int iy_r = static_cast<int>(std::round(iy));
            for (int r = iy_r - ry, i = 0; r <= iy_r + ry; r++, i++) {
                idxOH[oy * diaOH + i] = r;
                if (r < 0 || r >= static_cast<int>(IH)) {
                    weightOH[oy * diaOH + i] = 0.f;
                } else {
                    float dy = iy - r;
                    weightOH[oy * diaOH + i] = ay * triangleCoeff(ay * dy);
                }
            }
        }
        for (int ox = 0; ox < OW; ox++) {
            float ix = coordTransToInput(ox, fx, IW, OW);
            int ix_r = static_cast<int>(std::round(ix));
            for (int r = ix_r - rx, i = 0; r <= ix_r + rx; r++, i++) {
                idxOW[ox * diaOW + i] = r;
                if (r < 0 || r >= static_cast<int>(IW)) {
                    weightOW[ox * diaOW + i] = 0.f;
                } else {
                    float dx = ix - r;
                    weightOW[ox * diaOW + i] = ax * triangleCoeff(ax * dx);
                }
            }
        }
    }
}

std::vector<float> ov::intel_cpu::InterpolateExecutor::getCubicCoeffs(float mantissa, float a) {
    float m = std::fabs(mantissa);
    std::vector<float> coeffs(4, 0.f);

    coeffs[0] = a * (m - 1.0) * (m - 1.0) * m;
    coeffs[1] = ((a + 2.0) * m - (a + 3.0)) * m * m + 1.0;
    coeffs[2] = (((-a - 2.0) * m + (2.0 * a + 3.0)) * m - a) * m;
    coeffs[3] = -a * m * m * (m - 1.0);
    return coeffs;
}
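A useful sanity property of these cubic-convolution weights is that the four coefficients sum to 1 for any fractional offset, which is why the interpolation preserves constant signals. A standalone check, using a local copy of the formula with a = -0.75 (the default cubeCoeff):

#include <cmath>
#include <iostream>
#include <vector>

// Standalone copy of the cubic-convolution weights.
std::vector<float> getCubicCoeffs(float m, float a) {
    m = std::fabs(m);
    return { a * (m - 1) * (m - 1) * m,
             ((a + 2) * m - (a + 3)) * m * m + 1,
             (((-a - 2) * m + (2 * a + 3)) * m - a) * m,
             -a * m * m * (m - 1) };
}

int main() {
    for (float m : {0.0f, 0.25f, 0.5f, 0.75f}) {
        float sum = 0;
        for (float c : getCubicCoeffs(m, -0.75f)) sum += c;
        std::cout << "m=" << m << " sum=" << sum << '\n';  // ~1 for every m
    }
}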

// table layout:
// OW    OW         OW         OW         OW          OH    OH         OH         OH         OH
// x_idx x_weight0  x_weight1  x_weight2  x_weight3   y_idx y_weight0  y_weight1  y_weight2  y_weight3
void ov::intel_cpu::InterpolateExecutor::buildTblCubic(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector<float>& dataScales,
        float cubicCoeff, InterpolateLayoutType layout) {
    int dimSize = dataRank;
    float fy = dataScales[dimSize - 2];
    float fx = dataScales[dimSize - 1];
    int IH = srcDimPad5d[3], IW = srcDimPad5d[4];
    int OH = dstDim5d[3], OW = dstDim5d[4];

    // idxNum for index, CUBIC_GRID_LEN for weight
    const int idxNum = 1;
    size_t idxWeightSize = (CUBIC_GRID_LEN + idxNum) * OW + (CUBIC_GRID_LEN + idxNum) * OH;
    if (layout != InterpolateLayoutType::planar) {
        indexTable.resize(idxWeightSize);
    } else {
        size_t sequenceSize = 2 * OH * OW;
        indexTable.resize(idxWeightSize + sequenceSize);
    }

    int tblAdvance = 0;
    int *xOrigin = static_cast<int*>(&indexTable[tblAdvance]);
    tblAdvance += OW;
    float *xFactor = reinterpret_cast<float*>(&indexTable[tblAdvance]);
    for (int ox = 0; ox < OW; ox++) {
        float ix = coordTransToInput(ox, fx, IW, OW);
        int ix_r = static_cast<int>(std::floor(ix));
        xOrigin[ox] = ix_r;
        float m = ix - ix_r;
        std::vector<float> coeffs = getCubicCoeffs(m, cubicCoeff);
        xFactor[CUBIC_GRID_LEN * ox] = coeffs[0];
        xFactor[CUBIC_GRID_LEN * ox + 1] = coeffs[1];
        xFactor[CUBIC_GRID_LEN * ox + 2] = coeffs[2];
        xFactor[CUBIC_GRID_LEN * ox + 3] = coeffs[3];
    }

    tblAdvance += CUBIC_GRID_LEN * OW;
    int *yOrigin = static_cast<int*>(&indexTable[tblAdvance]);
    tblAdvance += OH;
    float *yFactor = reinterpret_cast<float*>(&indexTable[tblAdvance]);
    for (int oy = 0; oy < OH; oy++) {
        float iy = coordTransToInput(oy, fy, IH, OH);
        int iy_r = static_cast<int>(std::floor(iy));
        yOrigin[oy] = iy_r;
        float m = iy - iy_r;
        std::vector<float> coeffs = getCubicCoeffs(m, cubicCoeff);
        yFactor[CUBIC_GRID_LEN * oy] = coeffs[0];
        yFactor[CUBIC_GRID_LEN * oy + 1] = coeffs[1];
        yFactor[CUBIC_GRID_LEN * oy + 2] = coeffs[2];
        yFactor[CUBIC_GRID_LEN * oy + 3] = coeffs[3];
    }

    if (layout == InterpolateLayoutType::planar) {
        tblAdvance += CUBIC_GRID_LEN * OH;
        int *sequenceOH = static_cast<int*>(&indexTable[tblAdvance]);
        tblAdvance += OH * OW;
        int *sequenceOW = static_cast<int*>(&indexTable[tblAdvance]);
        for (int h = 0; h < OH; ++h) {
            int offset = h * OW;
            for (int w = 0; w < OW; ++w) {
                sequenceOH[offset + w] = h * sizeof(int);
                sequenceOW[offset + w] = w * sizeof(int);
            }
        }
    }
}

// shapeND: n     c    d    h   w
// blockND: ncdhw cdhw dhw  hw  w   1
// index  : 0     1    2    3   4   5
inline SizeVector getBlockND(const SizeVector& shape) {
    int shapeRank = shape.size();
    SizeVector blockND(shapeRank + 1, 1);
    for (int i = shapeRank - 1; i >= 0; i--) {
        blockND[i] = shape[i] * blockND[i + 1];
    }
    return blockND;
}
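In other words, blockND[i] is the number of elements spanned by one step of dimension i, with a trailing 1 for the innermost step. A standalone copy for a concrete shape:

#include <cstddef>
#include <iostream>
#include <vector>

// Standalone copy of getBlockND() over plain size_t vectors.
std::vector<std::size_t> getBlockND(const std::vector<std::size_t>& shape) {
    std::vector<std::size_t> blockND(shape.size() + 1, 1);
    for (int i = static_cast<int>(shape.size()) - 1; i >= 0; i--)
        blockND[i] = shape[i] * blockND[i + 1];
    return blockND;
}

int main() {
    for (std::size_t b : getBlockND({2, 3, 4, 5}))
        std::cout << b << ' ';   // prints: 120 60 20 5 1
    std::cout << '\n';
}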

const uint8_t* ov::intel_cpu::InterpolateExecutor::padPreprocess(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst) {
    const uint8_t *src_data_origin = reinterpret_cast<uint8_t*>(src[0]->GetData());

    const auto &srcDim = src[0]->getStaticDims();
    const auto &dstDim = dst[0]->getStaticDims();
    size_t dimSize = srcDim.size();
    auto srcDimPad = getSrcDimPad5d();

    const auto srcDim5d = to5Dim(srcDim);
    const auto srcDimPad5d = to5Dim(srcDimPad);
    const auto dstDim5d = to5Dim(dstDim);
    const auto srcDataSize = src[0]->getDesc().getPrecision().size();

    const uint8_t *src_data = nullptr;
    std::vector<uint8_t> srcPadded;
    if (interpAttrs.hasPad) {
        int padB0 = (dimSize > 2) ? interpAttrs.padBegin[0] : 0;
        int padB1 = (dimSize > 2) ? interpAttrs.padBegin[1] : 0;
        int padB2 = (dimSize == 5) ? interpAttrs.padBegin[dimSize - 3] : 0;
        int padB3 = interpAttrs.padBegin[dimSize - 2];
        int padB4 = interpAttrs.padBegin[dimSize - 1];

        SizeVector inShapeBlock = getBlockND(srcDim5d);
        SizeVector inShapePadBlock = getBlockND(srcDimPad5d);

        if (interpAttrs.layout == InterpolateLayoutType::planar) {
            srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
            uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
            parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) {
                const uint8_t *src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + inShapeBlock[3] * d + inShapeBlock[4] * h) * srcDataSize;
                uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) +
                    inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * srcDataSize;
                cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize);
            });
            src_data = src_data_pad;
        } else if (interpAttrs.layout == InterpolateLayoutType::by_channel) {
            srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
            uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
            parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) {
                const uint8_t *src = src_data_origin + (inShapeBlock[1] * n +
                    (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * srcDataSize;
                uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + (inShapePadBlock[3] * (d + padB2) +
                    inShapePadBlock[4] * (h + padB3) +
                    inShapePadBlock[5] * (w + padB4)) * srcDimPad5d[1] + padB1) * srcDataSize;
                cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize);
            });
            src_data = src_data_pad;
        } else if (interpAttrs.layout == InterpolateLayoutType::block) {
            size_t blkSize = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) ? 16 : 8;
            size_t CB = div_up(srcDimPad5d[1], blkSize);
            size_t eltsTotal = srcDimPad5d[0] * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize;
            srcPadded.resize(eltsTotal * srcDataSize, 0x0);
            uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
            if ((srcDim5d[0] != srcDimPad5d[0]) || (srcDim5d[1] != srcDimPad5d[1])) {
                IE_THROW() << "Interpolate executor does not support padding on batch and channel dimensions";
            }
            parallel_for5d(srcDim5d[0], CB, srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int cb, int d, int h, int w) {
                const uint8_t *src = src_data_origin + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
                                     + (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
                                     + (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
                                     + (h * srcDim5d[4] * blkSize) * srcDataSize
                                     + (w * blkSize) * srcDataSize;
                uint8_t *srcPad = src_data_pad + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
                                  + (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
                                  + ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
                                  + ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize
                                  + ((w + padB4) * blkSize) * srcDataSize;
                cpu_memcpy(srcPad, src, blkSize * srcDataSize);
            });
            src_data = src_data_pad;
        }
    } else {
        src_data = src_data_origin;
    }
    return src_data;
}
187
src/plugins/intel_cpu/src/nodes/executors/interpolate.hpp
Normal file
@ -0,0 +1,187 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <ie_common.h>
#include <node.h>
#include <string>
#include <memory>
#include <vector>

#define MAX_INPUT_INTERPOLATE 8

using namespace InferenceEngine;

namespace ov {
namespace intel_cpu {

enum InterpolateLayoutType {
    planar,
    block,
    by_channel
};

enum InterpolateMode {
    nearest,
    linear,
    linear_onnx,
    cubic
};

enum InterpolateCoordTransMode {
    half_pixel,
    pytorch_half_pixel,
    asymmetric,
    tf_half_pixel_for_nn,
    align_corners
};

enum class InterpolateNearestMode {
    round_prefer_floor,
    round_prefer_ceil,
    floor,
    ceil,
    simple
};

enum class InterpolateShapeCalcMode {
    sizes,
    scales
};

struct InterpolateAttrs {
    InterpolateMode mode = InterpolateMode::nearest;
    InterpolateCoordTransMode coordTransMode = InterpolateCoordTransMode::half_pixel;
    InterpolateNearestMode nearestMode = InterpolateNearestMode::round_prefer_floor;
    bool antialias = false;
    float cubeCoeff = -0.75;
    std::vector<int> padBegin;
    std::vector<int> padEnd;
    InferenceEngine::Precision inPrc;
    InferenceEngine::Precision outPrc;
    InterpolateLayoutType layout;
    std::vector<float> dataScales;
    bool hasPad = false;
};

inline SizeVector getPaddedInputShape(const VectorDims &srcDims,
                                      const std::vector<int> &padBegin,
                                      const std::vector<int> &padEnd) {
    SizeVector paddedShape;
    int dataRank = srcDims.size();
    for (int i = 0; i < dataRank; i++) {
        paddedShape.push_back(srcDims[i] + padBegin[i] + padEnd[i]);
    }
    return paddedShape;
}

inline int clipCoord(int pos, int length) {
    return std::max(static_cast<int>(0), std::min(pos, length - 1));
}

inline size_t getSpatialDimsNum(const Dim rank) {
    switch (rank) {
        case 1:
        case 3:
            return 1;
        case 2:
        case 4:
            return 2;
        case 5:
            return 3;
        default:
            IE_THROW() << "Can't define the number of spatial dimensions for rank: " << rank;
    }
}

// w/hw/ncw/nchw/ncdhw to ncdhw
inline SizeVector to5Dim(SizeVector casesDim) {
    size_t caseSize = casesDim.size();
    SizeVector dim5(5, 1lu);
    dim5[4] = casesDim[caseSize - 1];
    if (caseSize > 1) {
        dim5[3] = casesDim[caseSize - 2];
    }
    if (caseSize > 2) {
        dim5[0] = casesDim[0];
    }
    if (caseSize > 3) {
        dim5[1] = casesDim[1];
    }
    if (caseSize > 4) {
        dim5[2] = casesDim[2];
    }
    if (caseSize == 3) { // nhw -> ncw
        dim5[1] = dim5[3];
        dim5[3] = 1lu;
    }
    return dim5;
}
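A standalone sketch of what to5Dim produces for a couple of ranks; plain size_t vectors stand in for SizeVector, so this only illustrates the mapping:

#include <cstddef>
#include <iostream>
#include <vector>

// Local copy of to5Dim(): pads any 1D..5D shape out to NCDHW order.
std::vector<std::size_t> to5Dim(std::vector<std::size_t> d) {
    std::size_t n = d.size();
    std::vector<std::size_t> dim5(5, 1);
    dim5[4] = d[n - 1];
    if (n > 1) dim5[3] = d[n - 2];
    if (n > 2) dim5[0] = d[0];
    if (n > 3) dim5[1] = d[1];
    if (n > 4) dim5[2] = d[2];
    if (n == 3) {            // ncw: the middle dim is channels, not height
        dim5[1] = dim5[3];
        dim5[3] = 1;
    }
    return dim5;
}

int main() {
    for (auto& s : {std::vector<std::size_t>{2, 3, 4},       // ncw  -> {2,3,1,1,4}
                    std::vector<std::size_t>{2, 3, 4, 5}}) { // nchw -> {2,3,1,4,5}
        for (std::size_t v : to5Dim(s)) std::cout << v << ' ';
        std::cout << '\n';
    }
}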

static inline float triangleCoeff(float x) {
    return (std::max)(0.0f, 1 - std::abs(x));
}

class InterpolateExecutor {
public:
    static constexpr size_t DATA_ID = 0;
    static constexpr size_t TARGET_SHAPE_ID = 1;
    static constexpr size_t SCALES_ID = 2;
    static constexpr size_t AXES_ID = 3;
    static constexpr int CUBIC_GRID_LEN = 4;
    InterpolateExecutor(const ExecutorContext::CPtr context) : _context(context) {}

    virtual bool init(const InterpolateAttrs& interpolateAttrs,
                      const std::vector<MemoryDescPtr>& srcDescs,
                      const std::vector<MemoryDescPtr>& dstDescs,
                      const dnnl::primitive_attr &attr);
    virtual void exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) = 0;
    virtual impl_desc_type getImplType() const = 0;

    virtual ~InterpolateExecutor() = default;
    VectorDims getSrcDimPad5d() const { return srcDimPad5d; }
    const uint8_t* padPreprocess(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst);

private:
    void buildTblNN(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector<float>& dataScales,
                    InterpolateLayoutType layout, InterpolateNearestMode nearestMode);
    void buildTblLinearOnnx(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector<float>& dataScales,
                            InterpolateLayoutType layout);
    void buildTblLinear(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector<float>& dataScales, int kernel_width,
                        bool antialias);
    void buildTblCubic(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector<float>& dataScales, float cubicCoeff,
                       InterpolateLayoutType layout);

    float coordTransToInput(int outCoord, float scale, int inShape, int outShape) const;
    int nearestRound(float origin, bool isDownsample, InterpolateNearestMode nearestMode) const;
    void linearOnnxCF(int outCoord, float scale, int inShape, int outShape, int& index0, int& index1, float& weight0, float& weight1);
    std::vector<float> getCubicCoeffs(float mantissa, float a);

protected:
    InterpolateAttrs interpAttrs;
    VectorDims srcDimPad5d, dstDim5d;
    size_t srcDataSize, dstDataSize;
    int spatialDimSize;
    size_t dataRank;
    std::vector<int> indexTable;
    const ExecutorContext::CPtr _context;
};

using InterpolateExecutorPtr = std::shared_ptr<InterpolateExecutor>;
using InterpolateExecutorCPtr = std::shared_ptr<const InterpolateExecutor>;

class InterpolateExecutorBuilder {
public:
    virtual ~InterpolateExecutorBuilder() = default;
    virtual bool isSupported(const InterpolateAttrs& interpolateAttrs,
                             const std::vector<MemoryDescPtr>& srcDescs,
                             const std::vector<MemoryDescPtr>& dstDescs) const = 0;
    virtual InterpolateExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0;
};

using InterpolateExecutorBuilderPtr = std::shared_ptr<InterpolateExecutorBuilder>;
using InterpolateExecutorBuilderCPtr = std::shared_ptr<const InterpolateExecutorBuilder>;
} // namespace intel_cpu
} // namespace ov
@ -0,0 +1,19 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "interpolate_list.hpp"

namespace ov {
namespace intel_cpu {

const std::vector<InterpolateExecutorDesc>& getInterpolateExecutorsList() {
    static std::vector<InterpolateExecutorDesc> descs = {
        OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared<ACLInterpolateExecutorBuilder>())
    };

    return descs;
}

} // namespace intel_cpu
} // namespace ov
@ -0,0 +1,85 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "executor.hpp"

#include "interpolate.hpp"
#if defined(OV_CPU_WITH_ACL)
#include "acl/acl_interpolate.hpp"
#endif

#include "onednn/iml_type_mapper.h"
#include "common/primitive_cache.hpp"

namespace ov {
namespace intel_cpu {

struct InterpolateExecutorDesc {
    ExecutorType executorType;
    InterpolateExecutorBuilderCPtr builder;
};

const std::vector<InterpolateExecutorDesc>& getInterpolateExecutorsList();

class InterpolateExecutorFactory : public ExecutorFactory {
public:
    InterpolateExecutorFactory(const InterpolateAttrs& interpolateAttrs,
                               const std::vector<MemoryDescPtr>& srcDescs,
                               const std::vector<MemoryDescPtr>& dstDescs,
                               const ExecutorContext::CPtr context) : ExecutorFactory(context) {
        for (auto& desc : getInterpolateExecutorsList()) {
            if (desc.builder->isSupported(interpolateAttrs, srcDescs, dstDescs)) {
                supportedDescs.push_back(desc);
            }
        }
    }

    ~InterpolateExecutorFactory() = default;
    virtual InterpolateExecutorPtr makeExecutor(const InterpolateAttrs& interpolateAttrs,
                                                const std::vector<MemoryDescPtr>& srcDescs,
                                                const std::vector<MemoryDescPtr>& dstDescs,
                                                const dnnl::primitive_attr &attr) {
        auto build = [&](const InterpolateExecutorDesc* desc) {
            auto executor = desc->builder->makeExecutor(context);
            if (executor->init(interpolateAttrs, srcDescs, dstDescs, attr)) {
                return executor;
            }

            InterpolateExecutorPtr ptr = nullptr;
            return ptr;
        };

        if (chosenDesc) {
            if (auto executor = build(chosenDesc)) {
                return executor;
            }
        }

        for (const auto& sd : supportedDescs) {
            if (auto executor = build(&sd)) {
                chosenDesc = &sd;
                return executor;
            }
        }

        IE_THROW() << "Supported Interpolate executor is not found";
    }

    bool isEmpty() {
        return supportedDescs.empty();
    }

private:
    std::vector<InterpolateExecutorDesc> supportedDescs;
    const InterpolateExecutorDesc* chosenDesc = nullptr;
};

using InterpolateExecutorFactoryPtr = std::shared_ptr<InterpolateExecutorFactory>;
using InterpolateExecutorFactoryCPtr = std::shared_ptr<const InterpolateExecutorFactory>;

} // namespace intel_cpu
} // namespace ov
38
src/plugins/intel_cpu/src/nodes/executors/mvn.cpp
Normal file
@ -0,0 +1,38 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "mvn.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
using namespace InferenceEngine;
|
||||
|
||||
MVNExecutor::MVNExecutor(const ExecutorContext::CPtr context) : context(context) {}
|
||||
|
||||
SizeVector MVNExecutor::transformTo5DCase(const SizeVector& shape, bool initAcrossChannels) {
|
||||
switch (shape.size()) {
|
||||
// for 1 and 2 rank, if initAcrossChannels_ is true, adjust shape to fully vectorize under unified 5d procedure.
|
||||
// otherwise there are not enough data in spatial dimension to process in one kernel.
|
||||
case 1 : // C
|
||||
if (initAcrossChannels) {
|
||||
return SizeVector({1, 1, 1, 1, shape[0]});
|
||||
} else {
|
||||
return SizeVector({1, shape[0], 1, 1, 1});
|
||||
}
|
||||
case 2 : // NC
|
||||
if (initAcrossChannels) {
|
||||
return SizeVector({1, shape[0], 1, shape[1], 1});
|
||||
} else {
|
||||
return SizeVector({shape[0], shape[1], 1, 1, 1});
|
||||
}
|
||||
case 3 : { return SizeVector({shape[0], shape[1], 1, shape[2], 1}); }
|
||||
case 4 : { return SizeVector({shape[0], shape[1], 1, shape[2], shape[3]}); }
|
||||
case 5 : { return SizeVector({shape[0], shape[1], shape[2], shape[3], shape[4]}); }
|
||||
default : { IE_THROW() << "MVN executor doesn't support planar layout with rank: " << shape.size(); }
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
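For reference, a few concrete mappings implied by the switch above, shown as a compilable sketch covering only ranks 1 and 2 (SizeVector is a stand-in alias; the helper name to5D is illustrative):

#include <cstdio>
#include <vector>

using SizeVector = std::vector<size_t>;  // stand-in for InferenceEngine::SizeVector

// Worked examples of the mapping implemented by the switch above:
//   rank 1, {24},   initAcrossChannels=true  -> {1, 1, 1, 1, 24}
//   rank 1, {24},   initAcrossChannels=false -> {1, 24, 1, 1, 1}
//   rank 2, {2, 8}, initAcrossChannels=true  -> {1, 2, 1, 8, 1}
//   rank 2, {2, 8}, initAcrossChannels=false -> {2, 8, 1, 1, 1}
SizeVector to5D(const SizeVector& s, bool across) {
    if (s.size() == 1)
        return across ? SizeVector{1, 1, 1, 1, s[0]} : SizeVector{1, s[0], 1, 1, 1};
    return across ? SizeVector{1, s[0], 1, s[1], 1} : SizeVector{s[0], s[1], 1, 1, 1};
}

int main() {
    auto v = to5D({24}, true);
    std::printf("{%zu,%zu,%zu,%zu,%zu}\n", v[0], v[1], v[2], v[3], v[4]);  // {1,1,1,1,24}
    return 0;
}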
72
src/plugins/intel_cpu/src/nodes/executors/mvn.hpp
Normal file
@ -0,0 +1,72 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "cpu_memory.h"
#include "onednn/iml_type_mapper.h"
#include "executor.hpp"

namespace ov {
namespace intel_cpu {

enum MVNLayoutType {
    mvn_planar,
    mvn_block,
    mvn_by_channel
};

// Defines way to add epsilon: inside sqrt or outside.
enum MVNEpsMode {
    INSIDE_SQRT,
    OUTSIDE_SQRT
};

struct MVNAttrs {
    MVNLayoutType layout;
    std::tuple<size_t, size_t, size_t, size_t, size_t> shape5D;
    bool initAcrossChannels_;
    bool execAcrossChannels_;
    bool normalizeVariance_;
    float epsValue_;
    MVNEpsMode epsMode_;
    InferenceEngine::Precision src_prc;
    InferenceEngine::Precision dst_prc;
};

class MVNExecutor {
public:
    MVNExecutor(const ExecutorContext::CPtr context);
    virtual bool init(const MVNAttrs& mvnAttrs,
                      const std::vector<MemoryDescPtr>& srcDescs,
                      const std::vector<MemoryDescPtr>& dstDescs,
                      const dnnl::primitive_attr &attr) = 0;

    virtual void exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) = 0;
    virtual ~MVNExecutor() = default;

    virtual impl_desc_type getImplType() const = 0;

    static InferenceEngine::SizeVector transformTo5DCase(const InferenceEngine::SizeVector& shape, bool initAcrossChannels);

protected:
    MVNAttrs mvnAttrs;
    const ExecutorContext::CPtr context;
};

using MVNExecutorPtr = std::shared_ptr<MVNExecutor>;
using MVNExecutorCPtr = std::shared_ptr<const MVNExecutor>;

class MVNExecutorBuilder {
public:
    ~MVNExecutorBuilder() = default;
    virtual bool isSupported(const MVNAttrs& mvnAttrs, const std::vector<MemoryDescPtr>& srcDescs, const std::vector<MemoryDescPtr>& dstDescs) const = 0;
    virtual MVNExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0;
};

using MVNExecutorBuilderPtr = std::shared_ptr<MVNExecutorBuilder>;
using MVNExecutorBuilderCPtr = std::shared_ptr<const MVNExecutorBuilder>;

} // namespace intel_cpu
} // namespace ov
19
src/plugins/intel_cpu/src/nodes/executors/mvn_list.cpp
Normal file
@ -0,0 +1,19 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "mvn_list.hpp"

namespace ov {
namespace intel_cpu {

const std::vector<MVNExecutorDesc>& getMVNExecutorsList() {
    static std::vector<MVNExecutorDesc> descs = {
        OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared<AclMVNExecutorBuilder>())
    };

    return descs;
}

} // namespace intel_cpu
} // namespace ov
84
src/plugins/intel_cpu/src/nodes/executors/mvn_list.hpp
Normal file
@ -0,0 +1,84 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "executor.hpp"

#include "mvn.hpp"
#if defined(OV_CPU_WITH_ACL)
#include "acl/acl_mvn.hpp"
#endif

#include "onednn/iml_type_mapper.h"
#include "common/primitive_cache.hpp"

namespace ov {
namespace intel_cpu {

struct MVNExecutorDesc {
    ExecutorType executorType;
    MVNExecutorBuilderCPtr builder;
};

const std::vector<MVNExecutorDesc>& getMVNExecutorsList();

class MVNExecutorFactory : public ExecutorFactory {
public:
    MVNExecutorFactory(const MVNAttrs& mvnAttrs,
                       const std::vector<MemoryDescPtr>& srcDescs,
                       const std::vector<MemoryDescPtr>& dstDescs,
                       const ExecutorContext::CPtr context) : ExecutorFactory(context) {
        for (auto& desc : getMVNExecutorsList()) {
            if (desc.builder->isSupported(mvnAttrs, srcDescs, dstDescs)) {
                supportedDescs.push_back(desc);
            }
        }
    }

    ~MVNExecutorFactory() = default;
    virtual MVNExecutorPtr makeExecutor(const MVNAttrs& mvnAttrs,
                                        const std::vector<MemoryDescPtr>& srcDescs,
                                        const std::vector<MemoryDescPtr>& dstDescs,
                                        const dnnl::primitive_attr &attr) {
        auto build = [&](const MVNExecutorDesc* desc) {
            auto executor = desc->builder->makeExecutor(context);
            if (executor->init(mvnAttrs, srcDescs, dstDescs, attr)) {
                return executor;
            }

            MVNExecutorPtr ptr = nullptr;
            return ptr;
        };

        if (chosenDesc) {
            if (auto executor = build(chosenDesc)) {
                return executor;
            }
        }

        for (const auto& sd : supportedDescs) {
            if (auto executor = build(&sd)) {
                chosenDesc = &sd;
                return executor;
            }
        }

        IE_THROW() << "Supported MVN executor is not found";
    }

    bool isEmpty() {
        return supportedDescs.empty();
    }

private:
    std::vector<MVNExecutorDesc> supportedDescs;
    const MVNExecutorDesc* chosenDesc = nullptr;
};

using MVNExecutorFactoryPtr = std::shared_ptr<MVNExecutorFactory>;
using MVNExecutorFactoryCPtr = std::shared_ptr<const MVNExecutorFactory>;

} // namespace intel_cpu
} // namespace ov
15
src/plugins/intel_cpu/src/nodes/executors/pooling.cpp
Normal file
@ -0,0 +1,15 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "pooling.hpp"

namespace ov {
namespace intel_cpu {

using namespace InferenceEngine;

PoolingExecutor::PoolingExecutor(const ExecutorContext::CPtr context) : context(context) {}

} // namespace intel_cpu
} // namespace ov
75
src/plugins/intel_cpu/src/nodes/executors/pooling.hpp
Normal file
@ -0,0 +1,75 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "cpu_memory.h"
#include "onednn/iml_type_mapper.h"
#include "executor.hpp"

namespace ov {
namespace intel_cpu {

struct PoolingAttrs {
    bool exclude_pad = false;
    bool auto_pad = false;

    op::PadType pad_type;
    Algorithm algorithm;

    op::RoundingType rounding;

    std::vector<ptrdiff_t> stride;
    std::vector<ptrdiff_t> kernel;
    std::vector<ptrdiff_t> dilation;

    std::vector<ptrdiff_t> data_pad_begin;
    std::vector<ptrdiff_t> data_pad_end;

    /// Effective padding. Used to define correct output shape by oneDNN
    /// reshape formula: (iw - kernel + pad_l + pad_r) / strides[i - 2] + 1
    /// should be passed into pooling desc constructor.
    std::vector<ptrdiff_t> effective_pad_begin;
    std::vector<ptrdiff_t> effective_pad_end;

    /// Effective dilation. Used to define correct dilation for OneDNN.
    /// For OneDNN default dilation is vector of zero
    std::vector<ptrdiff_t> effective_dilation;
};

class PoolingExecutor {
public:
    PoolingExecutor(const ExecutorContext::CPtr context);
    virtual bool init(const PoolingAttrs& poolingAttrs,
                      const std::vector<MemoryDescPtr>& srcDescs,
                      const std::vector<MemoryDescPtr>& dstDescs,
                      const dnnl::primitive_attr &attr) = 0;

    virtual void exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, std::unordered_map<int, MemoryPtr> postOpsArgs) = 0;
    virtual ~PoolingExecutor() = default;

    virtual impl_desc_type getImplType() const = 0;

protected:
    PoolingAttrs poolingAttrs;
    const ExecutorContext::CPtr context;
};

using PoolingExecutorPtr = std::shared_ptr<PoolingExecutor>;
using PoolingExecutorCPtr = std::shared_ptr<const PoolingExecutor>;

class PoolingExecutorBuilder {
public:
    ~PoolingExecutorBuilder() = default;
    virtual bool isSupported(const PoolingAttrs& poolingAttrs,
                             const std::vector<MemoryDescPtr>& srcDescs,
                             const std::vector<MemoryDescPtr>& dstDescs) const = 0;
    virtual PoolingExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0;
};

using PoolingExecutorBuilderPtr = std::shared_ptr<PoolingExecutorBuilder>;
using PoolingExecutorBuilderCPtr = std::shared_ptr<const PoolingExecutorBuilder>;

} // namespace intel_cpu
} // namespace ov
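To make the effective-padding comment in PoolingAttrs concrete, here is the output-width arithmetic it quotes, as a tiny self-contained sketch (the values are illustrative, not taken from the plugin):

#include <cstdio>

// oneDNN reshape formula quoted in the PoolingAttrs doc comment:
//   ow = (iw - kernel + pad_l + pad_r) / stride + 1
// Example: iw = 7, kernel = 3, pad_l = pad_r = 1, stride = 2
//   ow = (7 - 3 + 1 + 1) / 2 + 1 = 6 / 2 + 1 = 4
int main() {
    const int iw = 7, kernel = 3, pad_l = 1, pad_r = 1, stride = 2;
    const int ow = (iw - kernel + pad_l + pad_r) / stride + 1;
    std::printf("ow = %d\n", ow);  // prints: ow = 4
    return 0;
}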
19
src/plugins/intel_cpu/src/nodes/executors/pooling_list.cpp
Normal file
@ -0,0 +1,19 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "pooling_list.hpp"

namespace ov {
namespace intel_cpu {

const std::vector<PoolingExecutorDesc>& getPoolingExecutorsList() {
    static std::vector<PoolingExecutorDesc> descs = {
        OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared<AclPoolingExecutorBuilder>())
    };

    return descs;
}

} // namespace intel_cpu
} // namespace ov
78
src/plugins/intel_cpu/src/nodes/executors/pooling_list.hpp
Normal file
@ -0,0 +1,78 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "executor.hpp"

#include "pooling.hpp"
#if defined(OV_CPU_WITH_ACL)
#include "acl/acl_pooling.hpp"
#endif

namespace ov {
namespace intel_cpu {

struct PoolingExecutorDesc {
    ExecutorType executorType;
    PoolingExecutorBuilderCPtr builder;
};

const std::vector<PoolingExecutorDesc>& getPoolingExecutorsList();

class PoolingExecutorFactory : public ExecutorFactory {
public:
    PoolingExecutorFactory(const PoolingAttrs& poolingAttrs,
                           const std::vector<MemoryDescPtr>& srcDescs,
                           const std::vector<MemoryDescPtr>& dstDescs,
                           const ExecutorContext::CPtr context) : ExecutorFactory(context) {
        for (auto& desc : getPoolingExecutorsList()) {
            if (desc.builder->isSupported(poolingAttrs, srcDescs, dstDescs)) {
                supportedDescs.push_back(desc);
            }
        }
    }

    ~PoolingExecutorFactory() = default;
    virtual PoolingExecutorPtr makeExecutor(const PoolingAttrs& poolingAttrs,
                                            const std::vector<MemoryDescPtr>& srcDescs,
                                            const std::vector<MemoryDescPtr>& dstDescs,
                                            const dnnl::primitive_attr &attr) {
        auto build = [&](const PoolingExecutorDesc* desc) {
            auto executor = desc->builder->makeExecutor(context);
            if (executor->init(poolingAttrs, srcDescs, dstDescs, attr)) {
                return executor;
            }

            PoolingExecutorPtr ptr = nullptr;
            return ptr;
        };


        if (chosenDesc) {
            if (auto executor = build(chosenDesc)) {
                return executor;
            }
        }

        for (const auto& sd : supportedDescs) {
            if (auto executor = build(&sd)) {
                chosenDesc = &sd;
                return executor;
            }
        }

        IE_THROW() << "Supported Pooling executor is not found";
    }

private:
    std::vector<PoolingExecutorDesc> supportedDescs;
    const PoolingExecutorDesc* chosenDesc = nullptr;
};

using PoolingExecutorFactoryPtr = std::shared_ptr<PoolingExecutorFactory>;
using PoolingExecutorFactoryCPtr = std::shared_ptr<const PoolingExecutorFactory>;

} // namespace intel_cpu
} // namespace ov
15
src/plugins/intel_cpu/src/nodes/executors/reduce.cpp
Normal file
@ -0,0 +1,15 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "reduce.hpp"

namespace ov {
namespace intel_cpu {

using namespace InferenceEngine;

ReduceExecutor::ReduceExecutor(const ExecutorContext::CPtr context) : context(context) {}

} // namespace intel_cpu
} // namespace ov
55
src/plugins/intel_cpu/src/nodes/executors/reduce.hpp
Normal file
@ -0,0 +1,55 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "cpu_memory.h"
#include "onednn/iml_type_mapper.h"
#include "dnnl_scratch_pad.h"
#include "executor.hpp"

namespace ov {
namespace intel_cpu {

struct ReduceAttrs {
    std::vector<int> axes;
    Algorithm operation;
    bool keepDims;
};

class ReduceExecutor {
public:
    ReduceExecutor(const ExecutorContext::CPtr context);
    virtual bool init(const ReduceAttrs& reduceAttrs,
                      const std::vector<MemoryDescPtr>& srcDescs,
                      const std::vector<MemoryDescPtr>& dstDescs,
                      const dnnl::primitive_attr &attr) = 0;

    virtual void exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) = 0;
    virtual ~ReduceExecutor() = default;

    virtual impl_desc_type getImplType() const = 0;

protected:
    ReduceAttrs reduceAttrs;
    const ExecutorContext::CPtr context;
};

using ReduceExecutorPtr = std::shared_ptr<ReduceExecutor>;
using ReduceExecutorCPtr = std::shared_ptr<const ReduceExecutor>;

class ReduceExecutorBuilder {
public:
    ~ReduceExecutorBuilder() = default;
    virtual bool isSupported(const ReduceAttrs& reduceAttrs,
                             const std::vector<MemoryDescPtr>& srcDescs,
                             const std::vector<MemoryDescPtr>& dstDescs) const = 0;
    virtual ReduceExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0;
};

using ReduceExecutorBuilderPtr = std::shared_ptr<ReduceExecutorBuilder>;
using ReduceExecutorBuilderCPtr = std::shared_ptr<const ReduceExecutorBuilder>;

} // namespace intel_cpu
} // namespace ov
19
src/plugins/intel_cpu/src/nodes/executors/reduce_list.cpp
Normal file
@ -0,0 +1,19 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "reduce_list.hpp"

namespace ov {
namespace intel_cpu {

const std::vector<ReduceExecutorDesc>& getReduceExecutorsList() {
    static std::vector<ReduceExecutorDesc> descs = {
        OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared<AclReduceExecutorBuilder>())
    };

    return descs;
}

} // namespace intel_cpu
} // namespace ov
85
src/plugins/intel_cpu/src/nodes/executors/reduce_list.hpp
Normal file
@ -0,0 +1,85 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "executor.hpp"

#include "reduce.hpp"
#if defined(OV_CPU_WITH_ACL)
#include "acl/acl_reduce.hpp"
#endif

#include "onednn/iml_type_mapper.h"
#include "common/primitive_cache.hpp"

namespace ov {
namespace intel_cpu {

struct ReduceExecutorDesc {
    ExecutorType executorType;
    ReduceExecutorBuilderCPtr builder;
};

const std::vector<ReduceExecutorDesc>& getReduceExecutorsList();

class ReduceExecutorFactory : public ExecutorFactory {
public:
    ReduceExecutorFactory(const ReduceAttrs& reduceAttrs,
                          const std::vector<MemoryDescPtr>& srcDescs,
                          const std::vector<MemoryDescPtr>& dstDescs,
                          const ExecutorContext::CPtr context) : ExecutorFactory(context) {
        for (auto& desc : getReduceExecutorsList()) {
            if (desc.builder->isSupported(reduceAttrs, srcDescs, dstDescs)) {
                supportedDescs.push_back(desc);
            }
        }
    }

    ~ReduceExecutorFactory() = default;
    virtual ReduceExecutorPtr makeExecutor(const ReduceAttrs& reduceAttrs,
                                           const std::vector<MemoryDescPtr>& srcDescs,
                                           const std::vector<MemoryDescPtr>& dstDescs,
                                           const dnnl::primitive_attr &attr) {
        auto build = [&](const ReduceExecutorDesc* desc) {
            auto executor = desc->builder->makeExecutor(context);
            if (executor->init(reduceAttrs, srcDescs, dstDescs, attr)) {
                return executor;
            }

            ReduceExecutorPtr ptr = nullptr;
            return ptr;
        };


        if (chosenDesc) {
            if (auto executor = build(chosenDesc)) {
                return executor;
            }
        }

        for (const auto& sd : supportedDescs) {
            if (auto executor = build(&sd)) {
                chosenDesc = &sd;
                return executor;
            }
        }

        IE_THROW() << "Supported Reduce executor is not found";
    }

    bool isEmpty() {
        return supportedDescs.empty();
    }

private:
    std::vector<ReduceExecutorDesc> supportedDescs;
    const ReduceExecutorDesc* chosenDesc = nullptr;
};

using ReduceExecutorFactoryPtr = std::shared_ptr<ReduceExecutorFactory>;
using ReduceExecutorFactoryCPtr = std::shared_ptr<const ReduceExecutorFactory>;

} // namespace intel_cpu
} // namespace ov
@ -25,7 +25,7 @@ using namespace Xbyak;
namespace ov {
namespace intel_cpu {
namespace node {

#if defined(OPENVINO_ARCH_X86_64)
#define GET_OFF(field) offsetof(jit_extract_image_patches_args, field)

template <cpu_isa_t isa>
@ -270,6 +270,7 @@ private:
        dd(i * jpp.SW * jpp.dtype_size);
    }
};
#endif // OPENVINO_ARCH_X86_64

bool ExtractImagePatches::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
    try {
@ -481,6 +482,7 @@ void ExtractImagePatches::ExtractImagePatchesRefExecutor::executeReference(

void ExtractImagePatches::ExtractImagePatchesJitExecutor::executeOptimizedGeneric(
        void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) const {
#if defined(OPENVINO_ARCH_X86_64)
    const char* src_data = reinterpret_cast<const char*>(src);
    char* dst_data = reinterpret_cast<char*>(dst);
    const auto& jpp = pKernel->jpp;
@ -507,6 +509,7 @@ void ExtractImagePatches::ExtractImagePatchesJitExecutor::executeOptimizedGeneri
        args.w_hi_pad = iw_hpad;
        (*pKernel)(&args);
    });
#endif // OPENVINO_ARCH_X86_64
}

jit_extract_image_patches_params ExtractImagePatches::ExtractImagePatchesExecutor::fillJpp(
@ -585,6 +588,7 @@ ExtractImagePatches::ExtractImagePatchesJitExecutor::ExtractImagePatchesJitExecu
        const VectorDims& rates,
        const ExtImgPatcherPadType& padType,
        const size_t prcSize) {
#if defined(OPENVINO_ARCH_X86_64)
    auto jpp = fillJpp(inDims, outDims, kSizes, strides, rates, padType, prcSize);
    if (mayiuse(x64::avx512_core)) {
        pKernel.reset(new jit_extract_image_patches_kernel<x64::avx512_core>(jpp));
@ -598,6 +602,7 @@ ExtractImagePatches::ExtractImagePatchesJitExecutor::ExtractImagePatchesJitExecu

    if (pKernel)
        pKernel->create_ker();
#endif // OPENVINO_ARCH_X86_64
}

void ExtractImagePatches::ExtractImagePatchesJitExecutor::exec(
@ -45,7 +45,7 @@ using namespace Xbyak;
namespace ov {
namespace intel_cpu {
namespace node {

#if defined(OPENVINO_ARCH_X86_64)
#define GET_OFF(field) offsetof(jit_quantize_call_args, field)

template <cpu_isa_t isa>
@ -228,7 +228,7 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_
    };

    void generate() override {
        do_dequantization = jqp_.op_type == Algorithm::FQCommon || jqp_.op_type == Algorithm::FQRequantization;
        do_dequantization = jqp_.op_type == Algorithm::FQCommon;
        do_rounding = do_dequantization || jqp_.dst_prc == Precision::FP32;

        this->preamble();
@ -863,7 +863,7 @@ private:
        }
    }
};

#endif
bool FakeQuantize::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
    try {
        const auto fq = std::dynamic_pointer_cast<const ngraph::opset1::FakeQuantize>(op);
@ -1236,8 +1236,7 @@ FakeQuantize::FakeQuantize(const std::shared_ptr<ngraph::Node>& op, const GraphC
            }
        }

        algorithm = quantizationOnly ? Algorithm::FQQuantization :
            (isFakeQuantization || isFakeQuantizationWithScale) ? Algorithm::FQCommon : Algorithm::FQRequantization;
        algorithm = quantizationOnly ? Algorithm::FQQuantization : Algorithm::FQCommon;
    }
    } else {
        IE_THROW(NotImplemented) << errorMessage;
@ -1326,7 +1325,6 @@ void FakeQuantize::initSupportedPrimitiveDescriptors() {
    } else {
        impl_type = impl_desc_type::ref;
    }

    if (!mayiuse(cpu::x64::sse41) || getAxis() != 1) {
        impl_type = impl_desc_type::ref;

@ -1597,8 +1595,8 @@ void FakeQuantize::executeReference() {
        });
    }
}

void FakeQuantize::executeBinarization(const std::unique_ptr<jit_uni_quantize_kernel> &pKernel) const {
#if defined(OPENVINO_ARCH_X86_64)
    const auto &srcMemory = getParentEdgeAt(0)->getMemoryPtr();
    auto &dstMemory = getChildEdgeAt(0)->getMemoryPtr();

@ -1636,9 +1634,11 @@ void FakeQuantize::executeBinarization(const std::unique_ptr<jit_uni_quantize_ke

        (*pKernel)(&arg);
    });
#endif
}

void FakeQuantize::executeQuantization(const std::unique_ptr<jit_uni_quantize_kernel> &pKernel) const {
#if defined(OPENVINO_ARCH_X86_64)
    auto &srcMemory = getParentEdgeAt(0)->getMemoryPtr();
    auto &dstMemory = getChildEdgeAt(0)->getMemoryPtr();

@ -1761,6 +1761,7 @@ void FakeQuantize::executeQuantization(const std::unique_ptr<jit_uni_quantize_ke
            (*pKernel)(&arg);
        });
    }
#endif
}

void FakeQuantize::executeDynamicImpl(dnnl::stream strm) {
@ -2111,6 +2112,7 @@ bool FakeQuantize::appendAttrPostOps(DnnlPostOpsComposer& dnnlpoc,
}

FakeQuantize::FakeQuantizeJitExecutor::FakeQuantizeJitExecutor(const jit_quantize_params &_jqp) {
#if defined(OPENVINO_ARCH_X86_64)
    bool isBinarization = _jqp.op_type == Algorithm::FQBinarization;
    if (mayiuse(cpu::x64::avx512_core)) {
        if (isBinarization)
@ -2133,6 +2135,7 @@ FakeQuantize::FakeQuantizeJitExecutor::FakeQuantizeJitExecutor(const jit_quantiz
    if (pKernel) {
        pKernel->create_ker();
    }
#endif
}

void FakeQuantize::FakeQuantizeJitExecutor::exec(const FakeQuantize& node) {
@ -166,13 +166,11 @@ private:
    };
    using executorPtr = std::shared_ptr<FakeQuantizeExecutor>;
    executorPtr execPtr = nullptr;

    struct FakeQuantizeJitExecutor : public FakeQuantizeExecutor {
        FakeQuantizeJitExecutor(const jit_quantize_params &_jqp);
        void exec(const FakeQuantize& node) override;
        std::unique_ptr<jit_uni_quantize_kernel> pKernel;
    };

    void init() override;
    std::vector<LayoutType> getDataFormats() const;
    void initializePostOpData(const VectorDims &postOpDims, const size_t bufferAlignment, bool doRounding);
@ -9,7 +9,7 @@
#include "fake_quantize.h"
#include "input.h"
#include "reorder.h"
#include "ngraph_transformations/op/fully_connected.hpp"
#include "transformations/cpu_opset/common/op/fully_connected.hpp"
#include "ngraph/opsets/opset1.hpp"
#include "dnnl_extension_utils.h"
#include "onednn/dnnl.h"
@ -10,7 +10,7 @@
#include <ngraph/opsets/opset1.hpp>
#include "common/cpu_memcpy.h"
#include <utils/general_utils.h>
#include "kernels/gather_uni_kernel.hpp"
#include "kernels/x64/gather_uni_kernel.hpp"
#include "utils/shape_inference/shape_inference_cpu.hpp"

using namespace InferenceEngine;
@ -205,6 +205,7 @@ void Gather::initSupportedPrimitiveDescriptors() {
}

void Gather::createPrimitive() {
#if defined(OPENVINO_ARCH_X86_64)
    uint64_t idxElPerVec = 1;
    if (!isDynamicNode()) {
        idxElPerVec = x64::mayiuse(x64::avx512_core) ? x64::cpu_isa_traits<x64::avx512_core>::vlen / idxTypeSize :
@ -269,7 +270,7 @@ void Gather::createPrimitive() {
            }
        }
    }

#endif
    Node::createPrimitive();
}

@ -323,6 +324,7 @@ void Gather::prepareParams() {
        totalWork = beforeBatchSize * betweenBatchAndAxisSize * specIndicesSize * afterAxisSize;
    }

#if defined(OPENVINO_ARCH_X86_64)
    const auto& selectedPD = getSelectedPrimitiveDescriptor();
    if (jitKernel && jitKernel->isSupportedConfiguration(afterAxisSize)) {
        if (x64::mayiuse(x64::avx512_core)) {
@ -330,12 +332,12 @@ void Gather::prepareParams() {
        } else if (x64::mayiuse(x64::avx2)) {
            selectedPD->setImplementationType(jit_avx2);
        }
    } else {
        selectedPD->setImplementationType(ref_any);
    }
#endif
}

void Gather::execute(dnnl::stream strm) {
#if defined(OPENVINO_ARCH_X86_64)
    if (jitKernel && jitKernel->isSupportedConfiguration(afterAxisSize)) {
        const void* srcIndices = getParentEdgeAt(GATHER_INDICES)->getMemoryPtr()->GetPtr();
        const void* srcData = getParentEdgeAt(GATHER_DATA)->getMemoryPtr()->GetPtr();
@ -383,12 +385,15 @@ void Gather::execute(dnnl::stream strm) {
        };

        parallel_nt(0, threadBody);
    } else {
        execReference();

        return;
    }
#endif
    execReference();
}

void Gather::executeDynamicImpl(dnnl::stream strm) {
#if defined(OPENVINO_ARCH_X86_64)
    if (jitKernel && jitKernel->isSupportedConfiguration(afterAxisSize)) {
        const void* srcIndices = getParentEdgeAt(GATHER_INDICES)->getMemoryPtr()->GetPtr();
        const void* srcData = getParentEdgeAt(GATHER_DATA)->getMemoryPtr()->GetPtr();
@ -442,9 +447,11 @@ void Gather::executeDynamicImpl(dnnl::stream strm) {
        };

        parallel_nt(0, threadBody);
    } else {
        execReference();

        return;
    }
#endif
    execReference();
}

void Gather::initShortParams(threadExecParams& p, const uint64_t start) {
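The Gather hunks above show the idiom this patch applies across the plugin: the x64 JIT path is compiled only under OPENVINO_ARCH_X86_64 and returns early, so every other architecture falls through to the shared reference path. A minimal sketch of that arch-guard-with-fallback structure (the helper names are illustrative stand-ins, not plugin API):

#include <cstdio>

// Illustrative stand-ins for a node's JIT and reference implementations.
static bool jitKernelAvailable() { return false; }
static void execFast() { std::puts("jit"); }
static void execReference() { std::puts("reference"); }

// The JIT branch exists only in x64 builds and returns early; ARM and
// RISC-V builds compile just the shared reference call below the guard.
void execute() {
#if defined(OPENVINO_ARCH_X86_64)
    if (jitKernelAvailable()) {
        execFast();
        return;
    }
#endif
    execReference();
}

int main() { execute(); }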
@ -5,7 +5,7 @@
#pragma once

#include <node.h>
#include "kernels/gather_uni_kernel.hpp"
#include "kernels/x64/gather_uni_kernel.hpp"

#include <memory>
#include <string>
@ -5,7 +5,7 @@
#pragma once

#include <node.h>
#include "kernels/grid_sample.hpp"
#include "kernels/x64/grid_sample.hpp"

#include <memory>
#include <string>
@ -33,8 +33,9 @@ using namespace Xbyak;
namespace ov {
namespace intel_cpu {
namespace node {
namespace {

#if defined(OPENVINO_ARCH_X86_64)
namespace {
struct jit_has_subnormals_base : public jit_generator {
    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_has_subnormals_base)

@ -229,6 +230,7 @@ jit_has_subnormals_base::fn_t jit_has_subnormals_function() {
}

}   // namespace
#endif

Input::Input(const std::shared_ptr<ngraph::Node>& op, const GraphContext::CPtr context)
    : Node(op, context, PassThroughShapeInferFactory()) {
@ -297,6 +299,7 @@ void Input::cloneBlobIfRequired() {
        if (!size)
            return false;

#if defined(OPENVINO_ARCH_X86_64)
        if (auto fn = jit_has_subnormals_function()) {
            static const size_t batch_size = 2048;
            const size_t iterations_num = size / batch_size + 1;
@ -318,11 +321,12 @@ void Input::cloneBlobIfRequired() {
            });

            return has_subnormals;
        } else {
            for (size_t i = 0; i < size; ++i) {
                if (u32data[i] && (u32data[i] & (0xFF << 23)) == 0) {
                    return true;
                }
            }
#endif

        for (size_t i = 0; i < size; ++i) {
            if (u32data[i] && (u32data[i] & (0xFF << 23)) == 0) {
                return true;
            }
        }
    }
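The bit test in the fallback loop above detects FP32 subnormals: a nonzero 32-bit pattern whose eight exponent bits (mask 0xFF << 23) are all zero. A small self-contained illustration of the same test (a sketch, not plugin code; note it also matches negative zero, whose bits are nonzero with a zero exponent field):

#include <cstdint>
#include <cstdio>
#include <cstring>

// True when the float's bit pattern is nonzero but its exponent bits
// (bits 23..30, mask 0xFF << 23) are all zero -- a subnormal value.
static bool is_subnormal_bits(float f) {
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    return u && (u & (0xFFu << 23)) == 0;
}

int main() {
    std::printf("%d\n", is_subnormal_bits(1.0f));    // 0: exponent field is 0x7F
    std::printf("%d\n", is_subnormal_bits(1e-45f));  // 1: smallest positive subnormal
    return 0;
}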
@ -6,7 +6,7 @@
#include <string>
#include <vector>

#include "ngraph_transformations/op/interaction.hpp"
#include "transformations/cpu_opset/x64/op/interaction.hpp"
#include "interaction.h"
#include <onednn/dnnl.h>
#include <dnnl_extension_utils.h>
@ -18,8 +18,8 @@
#include <ie_ngraph_utils.hpp>
#include <cpu/x64/cpu_isa_traits.hpp>
#include <cpu/x64/jit_generator.hpp>
#include "emitters/jit_dnnl_emitters.hpp"
#include "emitters/jit_load_store_emitters.hpp"
#include "emitters/x64/jit_dnnl_emitters.hpp"
#include "emitters/x64/jit_load_store_emitters.hpp"

using namespace InferenceEngine;
using namespace dnnl::impl::cpu::x64;
@ -20,8 +20,8 @@
#include <cpu/x64/injectors/jit_uni_eltwise_injector.hpp>
#include "common/cpu_memcpy.h"
#include "utils/bfloat16.hpp"
#include "emitters/jit_bf16_emitters.hpp"
#include "emitters/jit_load_store_emitters.hpp"
#include "emitters/x64/jit_bf16_emitters.hpp"
#include "emitters/x64/jit_load_store_emitters.hpp"

#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset4.hpp>
@ -46,6 +46,8 @@ namespace ov {
namespace intel_cpu {
namespace node {

#if defined(OPENVINO_ARCH_X86_64)

template <cpu_isa_t isa>
struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, public jit_generator {
    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_interpolate_kernel_f32)
@ -1364,9 +1366,11 @@ private:
    }
};

#endif // OPENVINO_ARCH_X86_64

namespace {
struct InterpolateKey {
    Interpolate::InterpolateAttrs nodeAttrs;
    InterpolateAttrs nodeAttrs;
    VectorDims srcDims;
    VectorDims dstDims;
    std::vector<float> dataScales;
@ -1548,7 +1552,7 @@ bool Interpolate::isSupportedOperation(const std::shared_ptr<const ngraph::Node>
namespace {
/**
 * Interpolate shape inference factory. It defines the input mask depending on the shape calculation mode.
 *
 *
 */
class InterpolateShapeInferFactory : public ShapeInferFactory {
public:
@ -1769,7 +1773,7 @@ void Interpolate::initSupportedPrimitiveDescriptors() {
    auto axesType = Precision::I32;

    auto& creatorsMap = BlockedDescCreator::getCommonCreators();
    auto pushDesc = [&](LayoutType dataFormat, impl_desc_type implDetail) {
    auto pushDesc = [&](LayoutType dataFormat, impl_desc_type implDetail, bool useAclExecutor = false) {
        config.inConfs[DATA_ID].setMemDesc(creatorsMap.at(dataFormat)->createSharedDesc(inputPrecision, getInputShapeAtPort(DATA_ID)));
        config.inConfs[TARGET_SHAPE_ID].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(targetShapeType, getInputShapeAtPort(TARGET_SHAPE_ID)));
        config.inConfs[SCALES_ID].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(scalesType, getInputShapeAtPort(SCALES_ID)));
@ -1778,12 +1782,39 @@ void Interpolate::initSupportedPrimitiveDescriptors() {
            config.inConfs[AXES_ID].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(axesType, getInputShapeAtPort(AXES_ID)));

        config.outConfs[0].setMemDesc(creatorsMap.at(dataFormat)->createSharedDesc(outputPrecision, getOutputShapeAtPort(0)));
        supportedPrimitiveDescriptors.push_back({config, implDetail});

        if (useAclExecutor) {
            std::vector<MemoryDescPtr> srcMemoryDescs;
            for (int i = 0; i < config.inConfs.size(); i++) {
                srcMemoryDescs.push_back(config.inConfs[i].getMemDesc());
            }
            std::vector<MemoryDescPtr> dstMemoryDescs;
            for (int i = 0; i < config.outConfs.size(); i++) {
                dstMemoryDescs.push_back(config.outConfs[i].getMemDesc());
            }

            auto factory = std::make_shared<InterpolateExecutorFactory>(interpAttrs, srcMemoryDescs, dstMemoryDescs,
                    std::make_shared<ExecutorContext>(context, getPrimitivesPriority()));
            if (!factory->isEmpty()) {
                supportedPrimitiveDescriptors.push_back({config, implDetail, factory});
            }
        } else {
            supportedPrimitiveDescriptors.push_back({config, implDetail});
        }
    };

    const auto &dataMinDims = getInputShapeAtPort(DATA_ID).getMinDims();
    bool isBlkApplied = getInputShapeAtPort(DATA_ID).getRank() > 1 && dataMinDims[1] != Shape::UNDEFINED_DIM && dataMinDims[1] > 1;

#if defined (OV_CPU_WITH_ACL)
    interpAttrs.hasPad = hasPad;
    pushDesc(LayoutType::nspc, undef, true);
    pushDesc(LayoutType::ncsp, undef, true);
    canUseAclExecutor = !supportedPrimitiveDescriptors.empty();
    if (canUseAclExecutor)
        return;
#endif

    if (!mayiuse(cpu::x64::sse41) || interpAttrs.mode == InterpolateMode::linear) {
        pushDesc(LayoutType::ncsp, ref);
    } else {
@ -1897,11 +1928,28 @@ void Interpolate::prepareParams() {
        IE_THROW() << "Interpolate layer only supports resize on spatial dimensions(depth, height and width)";
    }

    if (canUseAclExecutor) {
        interpAttrs.dataScales = dataScales;

        std::vector<MemoryDescPtr> srcMemoryDescs;
        for (int i = 0; i < getParentEdges().size(); i++) {
            srcMemoryDescs.push_back(getParentEdgeAt(i)->getMemoryPtr()->getDescPtr());
        }
        std::vector<MemoryDescPtr> dstMemoryDescs;
        dstMemoryDescs.push_back(getChildEdgeAt(0)->getMemoryPtr()->getDescPtr());

        auto selectedPD = getSelectedPrimitiveDescriptor();
        aclExecPtr = selectedPD->getExecutorFactoryAs<InterpolateExecutorFactory>()->makeExecutor(interpAttrs, srcMemoryDescs, dstMemoryDescs, {});
        selectedPD->setImplementationType(aclExecPtr->getImplType());

        return;
    }

    InterpolateKey key = {interpAttrs, srcDims, dstDims, dataScales, dnnl::primitive_attr()};
    setPostOps(key.attr, dstDims);

    auto buildExecutor = [&](const InterpolateKey& key) -> std::shared_ptr<InterpolateExecutor> {
        std::shared_ptr<InterpolateExecutor> executor;
    auto buildExecutor = [&](const InterpolateKey& key) -> std::shared_ptr<InterpolateExecutorBase> {
        std::shared_ptr<InterpolateExecutorBase> executor;
        if ((key.nodeAttrs.mode == InterpolateMode::nearest || key.nodeAttrs.mode == InterpolateMode::linear_onnx ||
            key.nodeAttrs.mode == InterpolateMode::cubic) &&
            ((key.nodeAttrs.layout != InterpolateLayoutType::planar && mayiuse(cpu::x64::sse41)) ||
@ -2013,89 +2061,92 @@ std::vector<float> Interpolate::getScales(const VectorDims &srcDimPad, const Vec
}

void Interpolate::execute(dnnl::stream strm) {
    if (!execPtr) {
        IE_THROW() << "Can't execute Interpolate node. Primitive didn't created";
    }

    auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
    auto &srcMemPtr = getParentEdgeAt(DATA_ID)->getMemoryPtr();

    uint8_t *dst_data = reinterpret_cast<uint8_t*>(dstMemPtr->GetPtr());
    const uint8_t *src_data_origin = reinterpret_cast<uint8_t*>(srcMemPtr->GetData());
    if (execPtr) {
        uint8_t *dst_data = reinterpret_cast<uint8_t*>(dstMemPtr->GetPtr());
        const uint8_t *src_data_origin = reinterpret_cast<uint8_t*>(srcMemPtr->GetData());

    const auto &srcDim = srcMemPtr->getStaticDims();
    const auto &dstDim = dstMemPtr->getStaticDims();
    size_t dimSize = srcDim.size();
    auto srcDimPad = execPtr->getSrcDimPad5d();
        const auto &srcDim = srcMemPtr->getStaticDims();
        const auto &dstDim = dstMemPtr->getStaticDims();
        size_t dimSize = srcDim.size();
        auto srcDimPad = execPtr->getSrcDimPad5d();

    const auto srcDim5d = to5Dim(srcDim);
    const auto srcDimPad5d = to5Dim(srcDimPad);
    const auto dstDim5d = to5Dim(dstDim);
    const auto srcDataSize = srcMemPtr->getDesc().getPrecision().size();
        const auto srcDim5d = to5Dim(srcDim);
        const auto srcDimPad5d = to5Dim(srcDimPad);
        const auto dstDim5d = to5Dim(dstDim);
        const auto srcDataSize = srcMemPtr->getDesc().getPrecision().size();

    const uint8_t *src_data = nullptr;
    std::vector<uint8_t> srcPadded;
    if (hasPad) {
        int padB0 = (dimSize > 2) ? interpAttrs.padBegin[0] : 0;
        int padB1 = (dimSize > 2) ? interpAttrs.padBegin[1] : 0;
        int padB2 = (dimSize == 5) ? interpAttrs.padBegin[dimSize - 3] : 0;
        int padB3 = interpAttrs.padBegin[dimSize - 2];
        int padB4 = interpAttrs.padBegin[dimSize - 1];
        const uint8_t *src_data = nullptr;
        std::vector<uint8_t> srcPadded;
        if (hasPad) {
            int padB0 = (dimSize > 2) ? interpAttrs.padBegin[0] : 0;
            int padB1 = (dimSize > 2) ? interpAttrs.padBegin[1] : 0;
            int padB2 = (dimSize == 5) ? interpAttrs.padBegin[dimSize - 3] : 0;
            int padB3 = interpAttrs.padBegin[dimSize - 2];
            int padB4 = interpAttrs.padBegin[dimSize - 1];

        SizeVector inShapeBlock = getBlockND(srcDim5d);
        SizeVector inShapePadBlock = getBlockND(srcDimPad5d);
            SizeVector inShapeBlock = getBlockND(srcDim5d);
            SizeVector inShapePadBlock = getBlockND(srcDimPad5d);

        if (interpAttrs.layout == InterpolateLayoutType::planar) {
            srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
            uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
            parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) {
                const uint8_t *src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + inShapeBlock[3] * d + inShapeBlock[4] * h) * srcDataSize;
                uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) +
                    inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * srcDataSize;
                cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize);
            });
            src_data = src_data_pad;
        } else if (interpAttrs.layout == InterpolateLayoutType::by_channel) {
            srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
            uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
            parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) {
                const uint8_t *src = src_data_origin + (inShapeBlock[1] * n +
                    (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * srcDataSize;
                uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + (inShapePadBlock[3] * (d + padB2) +
                    inShapePadBlock[4] * (h + padB3) + inShapePadBlock[5] * (w + padB4)) * srcDimPad5d[1] + padB1) * srcDataSize;
                cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize);
            });
            src_data = src_data_pad;
        } else if (interpAttrs.layout == InterpolateLayoutType::block) {
            size_t blkSize = mayiuse(cpu::x64::avx512_core) ? 16 : 8;
            size_t CB = div_up(srcDimPad5d[1], blkSize);
            size_t eltsTotal = srcDimPad5d[0] * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize;
            srcPadded.resize(eltsTotal * srcDataSize, 0x0);
            uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
            if ((srcDim5d[0] != srcDimPad5d[0]) || (srcDim5d[1] != srcDimPad5d[1])) {
                IE_THROW() << "Interpolate layer with name '" << getName() <<
                    "' does not support padding on batch and channel dimensions";
            if (interpAttrs.layout == InterpolateLayoutType::planar) {
                srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
                uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
                parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) {
                    const uint8_t *src = src_data_origin +
                        (inShapeBlock[1] * n + inShapeBlock[2] * c + inShapeBlock[3] * d + inShapeBlock[4] * h) * srcDataSize;
                    uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) +
                        inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * srcDataSize;
                    cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize);
                });
                src_data = src_data_pad;
            } else if (interpAttrs.layout == InterpolateLayoutType::by_channel) {
                srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
                uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
                parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) {
                    const uint8_t *src = src_data_origin + (inShapeBlock[1] * n +
                        (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * srcDataSize;
                    uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + (inShapePadBlock[3] * (d + padB2) +
                        inShapePadBlock[4] * (h + padB3) + inShapePadBlock[5] * (w + padB4)) * srcDimPad5d[1] + padB1) * srcDataSize;
                    cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize);
                });
                src_data = src_data_pad;
            } else if (interpAttrs.layout == InterpolateLayoutType::block) {
                size_t blkSize = mayiuse(cpu::x64::avx512_core) ? 16 : 8;
                size_t CB = div_up(srcDimPad5d[1], blkSize);
                size_t eltsTotal = srcDimPad5d[0] * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize;
                srcPadded.resize(eltsTotal * srcDataSize, 0x0);
                uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
                if ((srcDim5d[0] != srcDimPad5d[0]) || (srcDim5d[1] != srcDimPad5d[1])) {
                    IE_THROW() << "Interpolate layer with name '" << getName() <<
                        "' does not support padding on batch and channel dimensions";
                }
                parallel_for5d(srcDim5d[0], CB, srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int cb, int d, int h, int w) {
                    const uint8_t *src = src_data_origin + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
                        + (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
                        + (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
                        + (h * srcDim5d[4] * blkSize) * srcDataSize
                        + (w * blkSize) * srcDataSize;
                    uint8_t *srcPad = src_data_pad + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
                        + (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
                        + ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
                        + ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize
                        + ((w + padB4) * blkSize) * srcDataSize;
                    cpu_memcpy(srcPad, src, blkSize * srcDataSize);
                });
                src_data = src_data_pad;
            }
            parallel_for5d(srcDim5d[0], CB, srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int cb, int d, int h, int w) {
                const uint8_t *src = src_data_origin + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
                    + (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
                    + (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize
                    + (h * srcDim5d[4] * blkSize) * srcDataSize
                    + (w * blkSize) * srcDataSize;
                uint8_t *srcPad = src_data_pad + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
                    + (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
                    + ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize
                    + ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize
                    + ((w + padB4) * blkSize) * srcDataSize;
                cpu_memcpy(srcPad, src, blkSize * srcDataSize);
            });
            src_data = src_data_pad;
        } else {
            src_data = src_data_origin;
        }
    } else {
        src_data = src_data_origin;
    }

    execPtr->exec(src_data, dst_data, postOpsDataPtrs.data());
        execPtr->exec(src_data, dst_data, postOpsDataPtrs.data());
    } else if (aclExecPtr) {
        aclExecPtr->exec({srcMemPtr}, {dstMemPtr}, postOpsDataPtrs.data());
    } else {
        IE_THROW() << "Can't execute Interpolate node. Primitive didn't created";
    }
}

// for ndhwc and nCdhw8c[16c]
@ -2369,7 +2420,7 @@ void Interpolate::InterpolateJitExecutor::cubicPlanar(const uint8_t *in_ptr_, ui
// =====================================================================================================================
// index layout:
// d_0............d_OD-1, h_0..............h_OH-1, w_0................w_OW-1
void Interpolate::InterpolateExecutor::buildTblNN(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
void Interpolate::InterpolateExecutorBase::buildTblNN(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
        const std::vector<float>& dataScales, InterpolateLayoutType layout, InterpolateNearestMode nearestMode) {
    const int dimSize = dataRank;
    float fz = (dimSize == 5) ? dataScales[dimSize - 3] : 1.f;
@ -2402,7 +2453,7 @@ void Interpolate::InterpolateExecutor::buildTblNN(const SizeVector& srcDimPad5d,
// scale is float(outShape) / float(inShape)
// strictly consistent with onnx calc manner(div scale, not multiply inverse), given this is done offline
// the slight precison diff can produce obvious wrong value due to "nearest round" behavior for NN mode
float Interpolate::InterpolateExecutor::coordTransToInput(int outCoord, float scale, int inShape, int outShape) const {
float Interpolate::InterpolateExecutorBase::coordTransToInput(int outCoord, float scale, int inShape, int outShape) const {
    if (scale == 1.0f || (inShape == outShape)) {
        return outCoord;
    }
@ -2440,7 +2491,7 @@ float Interpolate::InterpolateExecutor::coordTransToInput(int outCoord, float sc
    }
}

int Interpolate::InterpolateExecutor::nearestRound(float originCoord, bool isDownsample, InterpolateNearestMode nearestMode) const {
int Interpolate::InterpolateExecutorBase::nearestRound(float originCoord, bool isDownsample, InterpolateNearestMode nearestMode) const {
    switch (nearestMode) {
        case InterpolateNearestMode::round_prefer_floor: {
            if (originCoord == (static_cast<int>(originCoord) + 0.5f))
@ -2474,7 +2525,7 @@ int Interpolate::InterpolateExecutor::nearestRound(float originCoord, bool isDow
    }
}

void Interpolate::InterpolateExecutor::linearOnnxCF(int outCoord, float scale, int inShape, int outShape,
void Interpolate::InterpolateExecutorBase::linearOnnxCF(int outCoord, float scale, int inShape, int outShape,
        int& index0, int& index1, float& weight0, float& weight1) {
    float inCoord = coordTransToInput(outCoord, scale, inShape, outShape);
    inCoord = std::max(0.0f, std::min(inCoord, static_cast<float>(inShape - 1)));
@ -2489,7 +2540,7 @@ void Interpolate::InterpolateExecutor::linearOnnxCF(int outCoord, float scale, i
    }
}

void Interpolate::InterpolateExecutor::buildTblLinearOnnx(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
void Interpolate::InterpolateExecutorBase::buildTblLinearOnnx(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
        const std::vector<float>& dataScales, InterpolateLayoutType layout) {
    int dimSize = dataRank;
    float fz = (spatialDimSize > 2) ? dataScales[dimSize - 3] : 1.f;
@ -2602,7 +2653,7 @@ void Interpolate::InterpolateExecutor::buildTblLinearOnnx(const SizeVector& srcD
// wd .........wd, wh............wh, ww.............ww, id...........id, ih............ih, iw..............iw
//            |                                                        |
//   wh0.....wh_diameter                                  ih0.....ih_diameter
void Interpolate::InterpolateExecutor::buildTblLinear(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
void Interpolate::InterpolateExecutorBase::buildTblLinear(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d,
        const std::vector<float>& dataScales, int kernel_width, bool antialias) {
    int dimSize = dataRank;
    float fz = (dimSize == 5) ? dataScales[dimSize - 3] : 1.f;
@ -2679,7 +2730,7 @@ void Interpolate::InterpolateExecutor::buildTblLinear(const SizeVector& srcDimPa
    }
}

std::vector<float> Interpolate::InterpolateExecutor::getCubicCoeffs(float mantissa, float a) {
std::vector<float> Interpolate::InterpolateExecutorBase::getCubicCoeffs(float mantissa, float a) {
    float m = std::fabs(mantissa);
    std::vector<float> coeffs(4, 0.f);

@ -2693,7 +2744,7 @@ std::vector<float> Interpolate::InterpolateExecutor::getCubicCoeffs(float mantis
// table layout:
// OW    OW         OW         OW         OW          OH    OH         OH         OH         OH
// x_idx x_weight0 x_weight1 x_weight2 x_weight3 y_idx y_weight0 y_weight1 y_weight2 y_weight3
void Interpolate::InterpolateExecutor::buildTblCubic(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector<float>& dataScales,
void Interpolate::InterpolateExecutorBase::buildTblCubic(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector<float>& dataScales,
        float cubicCoeff, InterpolateLayoutType layout) {
    int dimSize = dataRank;
    float fy = dataScales[dimSize - 2];
@ -3085,7 +3136,7 @@ void Interpolate::InterpolateRefExecutor::linearInterpolation(const uint8_t *in_
    });
}

Interpolate::InterpolateExecutor::InterpolateExecutor(const InterpolateAttrs& interpAttrs,
Interpolate::InterpolateExecutorBase::InterpolateExecutorBase(const InterpolateAttrs& interpAttrs,
        const VectorDims &srcDims,
        const VectorDims &dstDims,
        const std::vector<float> &dataScales) :
@ -3128,7 +3179,7 @@ Interpolate::InterpolateJitExecutor::InterpolateJitExecutor(const InterpolateAtt
        const VectorDims &dstDims,
        const std::vector<float> &dataScales,
        const dnnl::primitive_attr &attr) :
        InterpolateExecutor(interpAttrs, srcDims, dstDims, dataScales) {
        InterpolateExecutorBase(interpAttrs, srcDims, dstDims, dataScales) {
    auto jcp = jit_interpolate_config_params();
    jcp.mode = mode;
    jcp.src_prc = interpAttrs.inPrc;
@ -3145,6 +3196,7 @@ Interpolate::InterpolateJitExecutor::InterpolateJitExecutor(const InterpolateAtt
    jcp.ID = srcDimPad5d[2];
    jcp.spatial_dim_size = getSpatialDimsNum(srcDims.size());
    jcp.layout = interpAttrs.layout;
#if defined(OPENVINO_ARCH_X86_64)
    if (jcp.layout != InterpolateLayoutType::planar) {
        if (mayiuse(cpu::x64::avx512_core)) {
            interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::x64::avx512_core>(jcp, *attr.get()));
@ -3159,6 +3211,7 @@ Interpolate::InterpolateJitExecutor::InterpolateJitExecutor(const InterpolateAtt
    } else {
        IE_THROW() << "Can't create InterpolateJitExecutor";
    }
#endif // OPENVINO_ARCH_X86_64
    if (interpolateKernel) {
        interpolateKernel->create_ker();
    } else {
@ -3266,4 +3319,4 @@ bool Interpolate::created() const {

} // namespace node
} // namespace intel_cpu
} // namespace ov
} // namespace ov
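As the comment above coordTransToInput notes, scale is float(outShape) / float(inShape) and the transform divides by the scale to stay bit-consistent with ONNX. For the default half_pixel mode the mapping is in = (out + 0.5) / scale - 0.5; a minimal sketch of that one mode (assumed from the standard ONNX definition, not copied from the plugin):

#include <cstdio>

// half_pixel coordinate transform: in = (out + 0.5) / scale - 0.5,
// with scale = outShape / inShape.
// Example: inShape = 4, outShape = 8 -> scale = 2.0; output x = 3 maps
// to input coordinate (3 + 0.5) / 2 - 0.5 = 1.25.
static float half_pixel_to_input(int outCoord, float scale) {
    return (outCoord + 0.5f) / scale - 0.5f;
}

int main() {
    std::printf("%.2f\n", half_pixel_to_input(3, 8.0f / 4.0f));  // 1.25
    return 0;
}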
@ -9,6 +9,8 @@
#include <string>
#include <memory>
#include <vector>
#include "executors/interpolate.hpp"
#include "executors/interpolate_list.hpp"

#define MAX_INPUT_INTERPOLATE 8

@ -18,40 +20,6 @@ namespace ov {
namespace intel_cpu {
namespace node {

enum InterpolateLayoutType {
    planar,
    block,
    by_channel
};

enum InterpolateMode {
    nearest,
    linear,
    linear_onnx,
    cubic
};

enum InterpolateCoordTransMode {
    half_pixel,
    pytorch_half_pixel,
    asymmetric,
    tf_half_pixel_for_nn,
    align_corners
};

enum class InterpolateNearestMode {
    round_prefer_floor,
    round_prefer_ceil,
    floor,
    ceil,
    simple
};

enum class InterpolateShapeCalcMode {
    sizes,
    scales
};

struct jit_interpolate_config_params {
    InterpolateLayoutType layout;
    InterpolateMode mode;
@ -121,31 +89,18 @@ public:
    bool needPrepareParams() const override;
    void prepareParams() override;

    struct InterpolateAttrs {
        InterpolateMode mode = InterpolateMode::nearest;
        InterpolateCoordTransMode coordTransMode = InterpolateCoordTransMode::half_pixel;
        InterpolateNearestMode nearestMode = InterpolateNearestMode::round_prefer_floor;
        bool antialias = false;
        float cubeCoeff = -0.75;
        std::vector<int> padBegin;
        std::vector<int> padEnd;
        InferenceEngine::Precision inPrc;
        InferenceEngine::Precision outPrc;
        InterpolateLayoutType layout;
    };

private:
    InterpolateAttrs interpAttrs;

    class InterpolateExecutor {
    class InterpolateExecutorBase {
    public:
        InterpolateExecutor(const InterpolateAttrs& interpAttrs,
        InterpolateExecutorBase(const InterpolateAttrs& interpAttrs,
                            const VectorDims &srcDims,
                            const VectorDims &dstDims,
                            const std::vector<float> &dataScales);

        virtual void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) = 0;
        virtual ~InterpolateExecutor() = default;
        virtual ~InterpolateExecutorBase() = default;
        VectorDims getSrcDimPad5d() const { return srcDimPad5d; }

    private:
@ -174,9 +129,9 @@ private:
        size_t dataRank;
        std::vector<int> indexTable;
    };
    std::shared_ptr<InterpolateExecutor> execPtr = nullptr;
    std::shared_ptr<InterpolateExecutorBase> execPtr = nullptr;

    class InterpolateJitExecutor : public InterpolateExecutor {
    class InterpolateJitExecutor : public InterpolateExecutorBase {
    public:
        InterpolateJitExecutor(const InterpolateAttrs& interpAttrs,
                               const VectorDims &srcDims,
@ -209,13 +164,13 @@ private:
        std::shared_ptr<jit_uni_interpolate_kernel> interpolateKernel = nullptr;
    };

    class InterpolateRefExecutor : public InterpolateExecutor {
    class InterpolateRefExecutor : public InterpolateExecutorBase {
    public:
        InterpolateRefExecutor(const InterpolateAttrs& interpAttrs,
                               const VectorDims &srcDims,
                               const VectorDims &dstDims,
                               const std::vector<float> &_dataScales) :
                InterpolateExecutor(interpAttrs, srcDims, dstDims, _dataScales),
                InterpolateExecutorBase(interpAttrs, srcDims, dstDims, _dataScales),
                antialias(interpAttrs.antialias), dataScales(_dataScales) {}

        void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) override;
@ -259,8 +214,11 @@ private:
    VectorDims lastOutputDims;

    std::string errorPrefix;

    bool canUseAclExecutor = false;
    std::shared_ptr<InterpolateExecutor> aclExecPtr = nullptr;
};

} // namespace node
} // namespace intel_cpu
} // namespace ov
} // namespace ov
@ -4,7 +4,7 @@

#pragma once
#include <cpu/x64/jit_generator.hpp>
#include <emitters/jit_load_store_emitters.hpp>
#include "emitters/x64/jit_load_store_emitters.hpp"
#include <ie/ie_precision.hpp>
#include <common/nstl.hpp>
#include <type_traits>
Some files were not shown because too many files have changed in this diff