From e738c4e83fb1ad2c2daadc52955d8d385b240a92 Mon Sep 17 00:00:00 2001 From: Egor Duplenskii Date: Tue, 13 Jun 2023 15:55:27 +0200 Subject: [PATCH] [CPU] Use primitive priority list more efficiently (#17135) --- .../intel_cpu/src/dnnl_extension_utils.cpp | 26 +-- .../intel_cpu/src/dnnl_extension_utils.h | 40 ++++ .../src/memory_desc/blocked_memory_desc.cpp | 8 +- .../src/memory_desc/blocked_memory_desc.h | 16 +- .../src/memory_desc/cpu_blocked_memory_desc.h | 6 +- .../memory_desc/dnnl_blocked_memory_desc.cpp | 2 +- .../memory_desc/dnnl_blocked_memory_desc.h | 4 +- src/plugins/intel_cpu/src/node.cpp | 206 ++++++++++-------- src/plugins/intel_cpu/src/node.h | 28 ++- .../intel_cpu/src/nodes/adaptive_pooling.cpp | 3 - src/plugins/intel_cpu/src/nodes/bin_conv.cpp | 5 +- src/plugins/intel_cpu/src/nodes/concat.cpp | 6 +- src/plugins/intel_cpu/src/nodes/conv.cpp | 192 +++++++--------- src/plugins/intel_cpu/src/nodes/conv.h | 7 +- src/plugins/intel_cpu/src/nodes/deconv.cpp | 8 +- src/plugins/intel_cpu/src/nodes/deconv.h | 4 +- src/plugins/intel_cpu/src/nodes/eltwise.cpp | 8 +- .../src/nodes/executors/executor.hpp | 17 +- src/plugins/intel_cpu/src/nodes/eye.cpp | 2 - .../intel_cpu/src/nodes/fullyconnected.cpp | 160 +++++++------- .../intel_cpu/src/nodes/fullyconnected.h | 6 +- .../intel_cpu/src/nodes/interpolate.cpp | 2 +- src/plugins/intel_cpu/src/nodes/lrn.cpp | 6 +- src/plugins/intel_cpu/src/nodes/lrn.h | 2 +- src/plugins/intel_cpu/src/nodes/matmul.cpp | 134 ++++++------ src/plugins/intel_cpu/src/nodes/matmul.h | 4 +- src/plugins/intel_cpu/src/nodes/mvn.cpp | 2 +- src/plugins/intel_cpu/src/nodes/node_config.h | 75 ++++--- src/plugins/intel_cpu/src/nodes/non_zero.cpp | 2 - src/plugins/intel_cpu/src/nodes/pooling.cpp | 105 +++++---- src/plugins/intel_cpu/src/nodes/reduce.cpp | 7 +- src/plugins/intel_cpu/src/nodes/reorder.cpp | 7 +- src/plugins/intel_cpu/src/nodes/reorder.h | 2 +- src/plugins/intel_cpu/src/nodes/rnn.cpp | 8 +- src/plugins/intel_cpu/src/nodes/rnn.h | 4 +- src/plugins/intel_cpu/src/nodes/roi_align.cpp | 3 - .../intel_cpu/src/nodes/roi_pooling.cpp | 3 - src/plugins/intel_cpu/src/nodes/shapeof.cpp | 2 - src/plugins/intel_cpu/src/nodes/softmax.cpp | 5 +- src/plugins/intel_cpu/src/nodes/split.cpp | 22 +- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 6 +- .../intel_cpu/src/onednn/iml_type_mapper.cpp | 5 + .../intel_cpu/src/onednn/iml_type_mapper.h | 2 + .../intel_cpu/src/utils/debug_caps_config.h | 36 +-- .../intel_cpu/src/utils/ngraph_utils.hpp | 2 +- .../single_layer_tests/convolution.cpp | 17 ++ .../single_layer_tests/group_convolution.cpp | 6 +- .../functional/single_layer_tests/matmul.cpp | 2 +- .../test_utils/convolution_params.hpp | 6 +- 49 files changed, 647 insertions(+), 584 deletions(-) diff --git a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp index c598ed69a59..b59e0ac857c 100644 --- a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp +++ b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp @@ -179,21 +179,11 @@ std::string DnnlExtensionUtils::query_impl_info_str(const const_dnnl_primitive_d return std::string(res); } -bool DnnlExtensionUtils::find_implementation(dnnl::primitive_desc& desc, impl_desc_type implType) { - primitive_desc_iterator& itpd = desc; - - while (itpd) { - const impl_desc_type descImplType = parse_impl_name(itpd.impl_info_str()); - - if (descImplType == implType) { - return true; - } - - if (!itpd.next_impl()) - break; - } - - return false; +bool 
DnnlExtensionUtils::find_implementation(dnnl::primitive_desc& desc, impl_desc_type impl_type) {
+    return DnnlExtensionUtils::find_implementation(desc,
+                                                   [impl_type](impl_desc_type cur_impl_type){
+                                                       return cur_impl_type == impl_type;
+                                                   });
 }
 
 dnnl_memory_desc_t DnnlExtensionUtils::clone_desc(const_dnnl_memory_desc_t cdesc) {
@@ -202,6 +192,12 @@ dnnl_memory_desc_t DnnlExtensionUtils::clone_desc(const_dnnl_memory_desc_t cdesc
     return cloned_md;
 }
 
+dnnl_primitive_desc_t DnnlExtensionUtils::clone_primitive_desc(const_dnnl_primitive_desc_t cprim_desc) {
+    dnnl_primitive_desc_t cloned_md = nullptr;
+    dnnl_primitive_desc_clone(&cloned_md, cprim_desc);
+    return cloned_md;
+}
+
 const char* DnnlExtensionUtils::query_pd_info(const_dnnl_primitive_desc_t pd) {
     return pd->info();
 }
diff --git a/src/plugins/intel_cpu/src/dnnl_extension_utils.h b/src/plugins/intel_cpu/src/dnnl_extension_utils.h
index aa672746e9e..8d557fed5d7 100644
--- a/src/plugins/intel_cpu/src/dnnl_extension_utils.h
+++ b/src/plugins/intel_cpu/src/dnnl_extension_utils.h
@@ -54,7 +54,47 @@ public:
     static std::shared_ptr<DnnlMemoryDesc> query_md(const const_dnnl_primitive_desc_t& pd, const dnnl::query& what, int idx = 0);
     static std::string query_impl_info_str(const const_dnnl_primitive_desc_t& pd);
+
+    template <typename T>
+    static bool find_implementation(dnnl::primitive_desc& desc, T&& comparator) {
+        dnnl::primitive_desc_iterator& itpd = desc;
+
+        while (itpd) {
+            const impl_desc_type descImplType = parse_impl_name(itpd.impl_info_str());
+
+            if (comparator(descImplType)) {
+                return true;
+            }
+
+            if (!itpd.next_impl())
+                break;
+        }
+
+        return false;
+    }
+
+    template <typename T, typename L>
+    static void for_each_implementation(dnnl::primitive_desc& desc, bool first_match, T&& comparator, L&& func) {
+        dnnl::primitive_desc_iterator& itpd = desc;
+
+        while (itpd) {
+            const impl_desc_type descImplType = parse_impl_name(itpd.impl_info_str());
+
+            if (comparator(descImplType)) {
+                func(itpd);
+                if (first_match)
+                    break;
+            }
+
+            if (!itpd.next_impl())
+                break;
+        }
+
+        return;
+    }
+
     static bool find_implementation(dnnl::primitive_desc& desc, impl_desc_type implType);
+    static dnnl_primitive_desc_t clone_primitive_desc(const_dnnl_primitive_desc_t cprim_desc);
     static dnnl_memory_desc_t clone_desc(const_dnnl_memory_desc_t cdesc);
     static const char* query_pd_info(const_dnnl_primitive_desc_t pd);
     static dnnl::algorithm convertToDnnlAlgorithm(Algorithm alg);
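For illustration, a minimal standalone sketch (not part of the patch) of how the two flavors of find_implementation and the new for_each_implementation helper compose; `desc` is assumed to be a valid dnnl::primitive_desc, `priorities` a std::vector<impl_desc_type>, and process() a hypothetical consumer:

    #include <algorithm>

    // Exact-type lookup, same behavior as the old find_implementation:
    const bool hasRef = DnnlExtensionUtils::find_implementation(desc, impl_desc_type::ref);

    // Comparator form: true if any available implementation matches the predicate.
    const bool hasJit = DnnlExtensionUtils::find_implementation(desc, [](impl_desc_type type) {
        return (type & impl_desc_type::jit) != 0;
    });

    // Visit every implementation whose type is in the priority list;
    // with first_match == true the iteration stops at the first accepted one.
    DnnlExtensionUtils::for_each_implementation(
        desc,
        /* first_match = */ false,
        [&](impl_desc_type type) {
            return std::find(priorities.begin(), priorities.end(), type) != priorities.end();
        },
        [&](dnnl::primitive_desc& matched) {
            process(matched);  // hypothetical consumer of the matched descriptor
        });
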
diff --git a/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.cpp b/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.cpp
index 6ba3837f349..7c17a1381b1 100644
--- a/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.cpp
+++ b/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.cpp
@@ -8,6 +8,12 @@ namespace ov {
 namespace intel_cpu {
 
+/* C++11 requires a definition in the cpp file */
+constexpr BlockedMemoryDesc::CmpMask BlockedMemoryDesc::FULL_MASK;
+constexpr BlockedMemoryDesc::CmpMask BlockedMemoryDesc::EMPTY_MASK;
+constexpr BlockedMemoryDesc::CmpMask BlockedMemoryDesc::SKIP_OFFSET_MASK;
+constexpr size_t BlockedMemoryDesc::OFFSET_MASK_POS;
+
 bool BlockedMemoryDesc::isCompatibleInternal(const BlockedMemoryDesc &rhs, CmpMask cmpMask) const {
     if (this->getShape() != rhs.getShape() || this->getPrecision() != rhs.getPrecision())
         return false;
@@ -35,7 +41,7 @@ bool BlockedMemoryDesc::isCompatibleInternal(const BlockedMemoryDesc &rhs, CmpMa
         return false;
     }
 
-    if (cmpMask.test(BLOCKED_DESC_OFFSET_MASK_POS)) {
+    if (cmpMask.test(OFFSET_MASK_POS)) {
         return dimsEqualWeak(this->getOffsetPadding(), rhs.getOffsetPadding());
     }
 
diff --git a/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.h
index 29a9204b30c..e554f28dbb9 100644
--- a/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.h
+++ b/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.h
@@ -11,17 +11,17 @@ namespace ov {
 namespace intel_cpu {
 
-#define BLOCKED_DESC_FULL_MASK 0xffffffff
-#define BLOCKED_DESC_EMPTY_MASK 0x0
-#define BLOCKED_DESC_SKIP_OFFSET_MASK 0x7fffffff
-#define BLOCKED_DESC_OFFSET_MASK_POS 31
-
 class BlockedMemoryDesc : public virtual MemoryDesc {
 public:
     using CmpMask = std::bitset<32>;
 
 public:
-    BlockedMemoryDesc() {}
+    BlockedMemoryDesc() = default;
+
+    static constexpr CmpMask FULL_MASK{0xffffffff};
+    static constexpr CmpMask EMPTY_MASK{0x0};
+    static constexpr CmpMask SKIP_OFFSET_MASK{0x7fffffff};
+    static constexpr size_t OFFSET_MASK_POS{31};
 
     /**
      * @brief Returns the blocked dimensions
@@ -76,7 +76,7 @@ public:
     virtual bool isCompatible(const BlockedMemoryDesc &rhs, CmpMask cmpMask) const = 0;
     using MemoryDesc::isCompatible;
 
-    virtual ~BlockedMemoryDesc() = default;
+    ~BlockedMemoryDesc() override = default;
 
     std::string serializeFormat() const override;
 
@@ -88,7 +88,7 @@ protected:
     * Doesn't perform descs specific attributes check
     * @return true if compatible, otherwise false
     */
-    bool isCompatibleInternal(const BlockedMemoryDesc &rhs, CmpMask cmpMask = BLOCKED_DESC_FULL_MASK) const;
+    bool isCompatibleInternal(const BlockedMemoryDesc &rhs, CmpMask cmpMask = FULL_MASK) const;
 
     mutable VectorDims blockedDims;
     mutable VectorDims strides;
diff --git a/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.h
index 0be05302518..8f0d0033b8a 100644
--- a/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.h
+++ b/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.h
@@ -24,8 +24,8 @@ public:
     bool isCompatible(const MemoryDesc& rhs) const override;
     bool isCompatible(const BlockedMemoryDesc& rhs, CmpMask cmpMask) const override;
-    bool isCompatible(const CpuBlockedMemoryDesc &rhs, CmpMask cmpMask = BLOCKED_DESC_FULL_MASK) const;
-    bool isCompatible(const DnnlBlockedMemoryDesc &rhs, CmpMask cmpMask = BLOCKED_DESC_FULL_MASK) const;
+    bool isCompatible(const CpuBlockedMemoryDesc &rhs, CmpMask cmpMask = BlockedMemoryDesc::FULL_MASK) const;
+    bool isCompatible(const DnnlBlockedMemoryDesc &rhs, CmpMask cmpMask = BlockedMemoryDesc::FULL_MASK) const;
 
     InferenceEngine::Precision getPrecision() const override {
         return precision;
@@ -92,7 +92,7 @@ private:
     MemoryDescPtr cloneWithNewDimsImp(const VectorDims& dims) const override;
 
     void setPrecision(InferenceEngine::Precision prc) override {
-        precision = std::move(prc);
+        precision = prc;
     }
 
 private:
diff --git a/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp b/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp
index bb3fe70481b..a8e585cc031 100644
--- a/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp
+++ b/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp
@@ -279,7 +279,7 @@ bool DnnlBlockedMemoryDesc::isCompatible(const DnnlBlockedMemoryDesc& rhs, CmpMa
         return false;
 
     const uint64_t stride_mask = (0xffffffffffffffff << cmpMask.size()) | cmpMask.to_ullong();
-    const bool checkOffset = cmpMask.test(BLOCKED_DESC_OFFSET_MASK_POS);
+    const bool checkOffset = cmpMask.test(OFFSET_MASK_POS);
 
     const auto thisExtra = wrappedThis.extra();
     const 
auto rhsExtra = wrappedRhs.extra(); diff --git a/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.h index 169d8543550..6c00ec7564c 100644 --- a/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.h +++ b/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.h @@ -28,8 +28,8 @@ public: bool isCompatible(const MemoryDesc& rhs) const override; bool isCompatible(const BlockedMemoryDesc& rhs, CmpMask cmpMask) const override; - bool isCompatible(const CpuBlockedMemoryDesc &rhs, CmpMask cmpMask = BLOCKED_DESC_FULL_MASK) const; - bool isCompatible(const DnnlBlockedMemoryDesc &rhs, CmpMask cmpMask = BLOCKED_DESC_FULL_MASK) const; + bool isCompatible(const CpuBlockedMemoryDesc &rhs, CmpMask cmpMask = FULL_MASK) const; + bool isCompatible(const DnnlBlockedMemoryDesc &rhs, CmpMask cmpMask = FULL_MASK) const; const VectorDims& getBlockDims() const override { return blockedDims; diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index f0bd5e16890..7b87f09007b 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -87,13 +87,13 @@ Node::Node(const std::shared_ptr& op, temporary(false), constant(ConstantType::Unknown), context(ctx), + algorithm(Algorithm::Default), + fusingPort(-1), engine(ctx->getEngine()), name(op->get_friendly_name()), typeStr(op->get_type_name()), type(TypeFromName(op->get_type_name())), profiling(op->get_friendly_name()) { - algorithm = Algorithm::Default; - fusingPort = -1; const std::string errorPrefix = "Ngraph operation " + std::string(op->get_type_name()) + " with name " + op->get_friendly_name(); for (size_t i = 0; i < op->get_input_size(); i++) { @@ -139,18 +139,21 @@ Node::Node(const std::shared_ptr& op, addOriginalLayer(name); } - auto primitivesPriority = getPrimitivesPriorityValue(op); + auto primitivesPriority = getImplPriorityValue(op); if (!primitivesPriority.empty()) { std::istringstream stream(primitivesPriority); std::string str; while (getline(stream, str, ',')) { if (str.substr(0, 4) != "cpu:") continue; - implPriorities.push_back(parse_impl_name(str)); - if (implPriorities[implPriorities.size() - 1] == impl_desc_type::unknown && + customImplPriorities.push_back(parse_impl_name(str)); + if (customImplPriorities.back() == impl_desc_type::unknown && str != "cpu:unknown") IE_THROW() << "Unsupported CPU implementation " << str << " for node " << getName(); } + // add default primitive priorities as a fallback for the custom ones + const auto& defaultImplPriorities = getDefaultImplPriority(); + customImplPriorities.insert(customImplPriorities.end(), defaultImplPriorities.begin(), defaultImplPriorities.end()); } std::string inputMemoryFormats = getInputMemoryFormats(op); @@ -262,7 +265,7 @@ void Node::createPrimitive() { } void Node::selectOptimalPrimitiveDescriptor() { - selectPreferPrimitiveDescriptor(getPrimitivesPriority(), false); + selectPreferPrimitiveDescriptor(getImplPriority(), false); } void Node::selectPreferPrimitiveDescriptor(const std::vector& priority, bool ignoreConstInputs) { @@ -621,44 +624,51 @@ void Node::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - auto attr = initPrimitiveAttr(); + auto addSupportedPrimitiveDescriptor = [&](const dnnl::primitive_desc& prim_desc) { + std::vector inConfs, outConfs; + const int inPlaceOutPort = canBeInPlace() ? 
0 : -1;
+
+        for (size_t i = 0; i < descInputNumbers(); i++) {
+            auto desc = getSrcMemDesc(prim_desc, i);
+
+            inConfs.emplace_back(desc, BlockedMemoryDesc::EMPTY_MASK);
+        }
+
+        for (size_t i = 0; i < descOutputNumbers(); i++) {
+            auto desc = getDstMemDesc(prim_desc, i);
+
+            outConfs.emplace_back(desc, BlockedMemoryDesc::EMPTY_MASK, inPlaceOutPort);
+        }
+
+        const NodeConfig config(inConfs, outConfs);
+        const impl_desc_type impl_type = parse_impl_name(prim_desc.impl_info_str());
+
+        supportedPrimitiveDescriptors.emplace_back(config, impl_type);
+    };
+
+    /* When custom implementation priorities are NOT defined, it is enough to
+     * just use the first implementation from the priority list.
+     * When custom implementation priorities are defined, all the implementations should be considered,
+     * since the custom implementations may not be available at all, so a fallback to the default ones must happen.
+     * To achieve the fallback, it is necessary to create a supported primitive descriptor for each implementation,
+     * since the oneDNN primitive descriptor is mutated while iterating over the implementations */
     for (auto& desc : descs) {
-        primitive_desc_iterator itpd = desc;
+        auto first_desc = dnnl::primitive_desc(DnnlExtensionUtils::clone_primitive_desc(desc.get()));
+        const bool first_match = customImplPriorities.empty();
+        DnnlExtensionUtils::for_each_implementation(desc,
+                                                    first_match,
+                                                    [&](impl_desc_type implType) {
+                                                        return contains(getImplPriority(), implType);
+                                                    },
+                                                    [&](dnnl::primitive_desc& desc) {
+                                                        addSupportedPrimitiveDescriptor(desc);
+                                                    });
 
-        while (static_cast<bool>(itpd)) {
-            NodeConfig config;
-            for (size_t i = 0; i < descInputNumbers(); i++) {
-                PortConfig portConfig;
-                portConfig.inPlace(-1);
-                portConfig.constant(false);
-                auto desc = getSrcMemDesc(itpd, i);
-                if (desc->getType() & MemoryDescType::Blocked) {
-                    portConfig.setMemDesc(std::dynamic_pointer_cast<BlockedMemoryDesc>(desc), BLOCKED_DESC_EMPTY_MASK);
-                } else {
-                    portConfig.setMemDesc(std::move(desc));
-                }
-                config.inConfs.push_back(portConfig);
-            }
-
-            for (size_t i = 0; i < descOutputNumbers(); i++) {
-                PortConfig portConfig;
-                portConfig.inPlace(canBeInPlace() ? 0 : -1);
-                portConfig.constant(false);
-                auto desc = getDstMemDesc(itpd, i);
-                if (desc->getType() & MemoryDescType::Blocked) {
-                    portConfig.setMemDesc(std::dynamic_pointer_cast<BlockedMemoryDesc>(desc), BLOCKED_DESC_EMPTY_MASK);
-                } else {
-                    portConfig.setMemDesc(std::move(desc));
-                }
-                config.outConfs.push_back(portConfig);
-            }
-            impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
-
-            supportedPrimitiveDescriptors.emplace_back(config, impl_type);
-            if (!itpd.next_impl())
-                break;
-        }
+        // Fallback: if none of the primitive types is present in the priority list, just add the first implementation
+        // @todo this fallback is not necessary if the primitive priority list is filled correctly
+        if (supportedPrimitiveDescriptors.empty())
+            addSupportedPrimitiveDescriptor(first_desc);
     }
 }
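For illustration, a minimal sketch (not part of the patch) of how a custom priority list reaches this code; the hint string is hypothetical, and the parsing mirrors the Node constructor above:

    #include <sstream>
    #include <string>
    #include <vector>

    // A node annotated in the IR with PrimitivesPriority = "cpu:jit_avx2,cpu:ref":
    const std::string primitivesPriority = "cpu:jit_avx2,cpu:ref";
    std::vector<impl_desc_type> customImplPriorities;

    std::istringstream stream(primitivesPriority);
    std::string str;
    while (getline(stream, str, ',')) {
        if (str.substr(0, 4) != "cpu:")
            continue;
        customImplPriorities.push_back(parse_impl_name(str));  // "cpu:jit_avx2" -> jit_avx2
    }
    // The constructor then appends getDefaultImplPriority(), so getImplPriority() yields
    // {jit_avx2, ref, ...defaults}, and the loop above keeps a supported descriptor for every
    // matching oneDNN implementation, falling back to a default type when jit_avx2 is absent.
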
@@ -971,51 +981,56 @@ void Node::cleanup() {
     }
 }
 
-const std::vector<impl_desc_type>& Node::getPrimitivesPriority() {
-    std::vector<impl_desc_type> priorities = {
-        impl_desc_type::unknown,
-        // Undef impl type is used to express use-cases there real type is unkown during compilation
-        // Undef has higher priority than defined types in order to force primitive selection logic to make decision based on other properties
-        impl_desc_type::undef,
-        impl_desc_type::brgconv_avx512_amx_1x1,
-        impl_desc_type::brgconv_avx512_amx,
-        impl_desc_type::jit_avx512_amx_dw,
-        impl_desc_type::jit_avx512_amx_1x1,
-        impl_desc_type::jit_avx512_amx,
-        // Brgconv kernels disabled in order to prevent perf degradations on non AMX HW
-        // impl_desc_type::brgconv_avx512_1x1,
-        // impl_desc_type::brgconv_avx512,
-        impl_desc_type::jit_uni_dw,
-        impl_desc_type::jit_uni_1x1,
-        impl_desc_type::jit_uni,
-        impl_desc_type::jit_avx512_dw,
-        impl_desc_type::jit_avx512_1x1,
-        impl_desc_type::jit_avx512,
-        impl_desc_type::jit_avx2_dw,
-        impl_desc_type::jit_avx2_1x1,
-        impl_desc_type::jit_avx2,
-        impl_desc_type::jit_avx_dw,
-        impl_desc_type::jit_avx_1x1,
-        impl_desc_type::jit_avx,
-        impl_desc_type::jit_sse42_dw,
-        impl_desc_type::jit_sse42_1x1,
-        impl_desc_type::jit_sse42,
-        impl_desc_type::gemm_any,
-        impl_desc_type::gemm_blas,
-        impl_desc_type::gemm_avx512,
-        impl_desc_type::gemm_avx2,
-        impl_desc_type::gemm_avx,
-        impl_desc_type::gemm_sse42,
-        impl_desc_type::acl,
-        impl_desc_type::jit_gemm,
-        impl_desc_type::ref_any,
-        impl_desc_type::ref,
+const std::vector<impl_desc_type>& Node::getDefaultImplPriority() {
+    static const std::vector<impl_desc_type> priorities {
+        impl_desc_type::unknown,
+        // Undef impl type is used to express use-cases where the real type is unknown during compilation
+        // Undef has higher priority than defined types in order to force primitive selection logic to make decision based on other properties
+        impl_desc_type::undef,
+        impl_desc_type::brgconv_avx512_amx_1x1,
+        impl_desc_type::brgconv_avx512_amx,
+        impl_desc_type::jit_avx512_amx_dw,
+        impl_desc_type::jit_avx512_amx_1x1,
+        impl_desc_type::jit_avx512_amx,
+        // Brgconv kernels disabled in order to prevent perf degradations on non AMX HW
+        // impl_desc_type::brgconv_avx512_1x1,
+        // impl_desc_type::brgconv_avx512,
+        impl_desc_type::jit_uni_dw,
+        impl_desc_type::jit_uni_1x1,
+        impl_desc_type::jit_uni,
+        impl_desc_type::jit_avx512_dw,
+        impl_desc_type::jit_avx512_1x1,
+        impl_desc_type::jit_avx512,
+        impl_desc_type::jit_avx2_dw,
+        impl_desc_type::jit_avx2_1x1,
+        impl_desc_type::jit_avx2,
+        impl_desc_type::jit_avx_dw,
+        impl_desc_type::jit_avx_1x1,
+        impl_desc_type::jit_avx,
+        impl_desc_type::jit_sse42_dw,
+        impl_desc_type::jit_sse42_1x1,
+        impl_desc_type::jit_sse42,
+        impl_desc_type::gemm_any,
+        impl_desc_type::gemm_blas,
+        impl_desc_type::gemm_avx512,
+        impl_desc_type::gemm_avx2,
+        impl_desc_type::gemm_avx,
+        impl_desc_type::gemm_sse42,
+        impl_desc_type::acl,
+        impl_desc_type::jit_gemm,
+        impl_desc_type::ref_any,
+        impl_desc_type::ref,
     };
-    for (const auto& impl : priorities) {
-        if (std::find(implPriorities.begin(), implPriorities.end(), impl) == implPriorities.end())
-            implPriorities.push_back(impl);
-    }
-    return implPriorities;
+
+    return priorities;
+}
+
+const std::vector<impl_desc_type>& Node::getImplPriority() {
+    if (!customImplPriorities.empty())
+        return 
customImplPriorities; + + + return getDefaultImplPriority(); } PortDescBasePtr Node::getConsistentInputDesc(const NodeConfig &config, size_t idx) const { @@ -1126,7 +1141,7 @@ void Node::initOptimalPrimitiveDescriptor() { // it is assumed that the nodes will define dense tensors on output edges // if it is not the case the implementation must redefine this behaviour if (outMemDesc->getType() & Blocked) { - config.outConfs[i].setMemDesc(std::dynamic_pointer_cast(outMemDesc), BLOCKED_DESC_FULL_MASK); + config.outConfs[i].setMemDesc(std::dynamic_pointer_cast(outMemDesc), BlockedMemoryDesc::FULL_MASK); } } } @@ -1144,18 +1159,18 @@ bool Node::isConfigDefined(const NodeConfig &config) const { return true; } -MemoryDescPtr Node::getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) { +MemoryDescPtr Node::getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { if (getInputShapeAtPort(idx).isDynamic()) { - return DnnlExtensionUtils::makeUndefinedDesc(primitive_desc_it.src_desc(idx), getInputShapeAtPort(idx)); + return DnnlExtensionUtils::makeUndefinedDesc(prim_desc.src_desc(idx), getInputShapeAtPort(idx)); } - return DnnlExtensionUtils::makeDescriptor(primitive_desc_it.src_desc(idx)); + return DnnlExtensionUtils::makeDescriptor(prim_desc.src_desc(idx)); } -MemoryDescPtr Node::getDstMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) { +MemoryDescPtr Node::getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { if (getOutputShapeAtPort(idx).isDynamic()) { - return DnnlExtensionUtils::makeUndefinedDesc(primitive_desc_it.dst_desc(idx), getOutputShapeAtPort(idx)); + return DnnlExtensionUtils::makeUndefinedDesc(prim_desc.dst_desc(idx), getOutputShapeAtPort(idx)); } - return DnnlExtensionUtils::makeDescriptor(primitive_desc_it.dst_desc(idx)); + return DnnlExtensionUtils::makeDescriptor(prim_desc.dst_desc(idx)); } void Node::appendPostOpArgs(const dnnl::primitive_attr& attr, @@ -1627,15 +1642,16 @@ void Node::addSupportedPrimDesc(const std::vector& inPortConfi if (!fill_port(outPortConfigs[i], dims, prc, config.outConfs)) return; } - supportedPrimitiveDescriptors.push_back({config, implType}); + + supportedPrimitiveDescriptors.emplace_back(config, implType); } void Node::initializeDQScales(const float* scaleData, const size_t scaleSize) { - bool scalePerTensor; if (!DQScales.empty() || !scaleSize) IE_THROW() << "DQ scales is preset or scale size is 0, ##" << getName(); DQScales.reserve(scaleSize); - scalePerTensor = true; + + bool scalePerTensor = true; for (size_t i = 0; i < scaleSize; i++) { DQScales.push_back(scaleData[i]); if (scaleData[i] != scaleData[0]) diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h index d11085bcdcf..9e94026de8d 100644 --- a/src/plugins/intel_cpu/src/node.h +++ b/src/plugins/intel_cpu/src/node.h @@ -76,15 +76,11 @@ private: class NodeDesc { public: - NodeDesc(const NodeConfig& conf, impl_desc_type type): config(conf) { - implementationType = type; - executorFactory = nullptr; - } + NodeDesc(NodeConfig conf, impl_desc_type type): + config(std::move(conf)), implementationType(type), executorFactory(nullptr) {} - NodeDesc(const NodeConfig& conf, impl_desc_type type, ExecutorFactoryPtr factory): config(conf) { - implementationType = type; - executorFactory = factory; - } + NodeDesc(NodeConfig conf, impl_desc_type type, ExecutorFactoryPtr factory): + config(std::move(conf)), implementationType(type), executorFactory(factory) {} const NodeConfig& getConfig() const { return 
config;
    }
 
@@ -560,8 +556,8 @@ protected:
     virtual PortDescBasePtr getConsistentInputDesc(const NodeConfig &config, size_t idx) const;
     virtual PortDescBasePtr getConsistentOutputDesc(const NodeConfig &config, size_t idx) const;
-    virtual MemoryDescPtr getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx);
-    virtual MemoryDescPtr getDstMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx);
+    virtual MemoryDescPtr getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const;
+    virtual MemoryDescPtr getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const;
 
     virtual AttrPtr initPrimitiveAttr() { return nullptr; }
 
@@ -574,7 +570,7 @@ protected:
     std::vector<NodePtr> fusedWith;
     std::vector<NodePtr> mergedWith;
 
-    std::vector<impl_desc_type> implPriorities;
+    std::vector<impl_desc_type> customImplPriorities;
     std::vector<memory::format_tag> inputMemoryFormatsFilter;
     std::vector<memory::format_tag> outputMemoryFormatsFilter;
     bool enforceBF16evenForGraphTail = false;
@@ -619,7 +615,11 @@ protected:
     bool isConfigDefined(const NodeConfig &config) const;
     virtual bool canBeInPlace() const;
 
-    virtual const std::vector<impl_desc_type>& getPrimitivesPriority();
+    /* returns the default implementation priority */
+    virtual const std::vector<impl_desc_type>& getDefaultImplPriority();
+    /* returns the custom implementation priority with the default implementation priority appended as a fallback;
+     * if a custom implementation priority is not specified, returns the default implementation priority */
+    const std::vector<impl_desc_type>& getImplPriority();
 
     virtual std::vector<memory::format_tag> getAvailableFormatsForDims(const Shape& dims) const;
 
@@ -724,9 +724,7 @@ private:
     // copies of same content with different layouts.
     std::unordered_map<std::string, MemoryPtr> privateWeightCache;
 
-#ifdef CPU_DEBUG_CAPS
-    friend class Verbose;
-#endif
+    CPU_DEBUG_CAP_ENABLE(friend class Verbose);
 };
 
 template
diff --git a/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp b/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp
index 401fb32f675..78efb60a2a1 100644
--- a/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp
+++ b/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp
@@ -116,9 +116,6 @@ AdaptivePooling::AdaptivePooling(const std::shared_ptr& op, const
 }
 
 void AdaptivePooling::getSupportedDescriptors() {
-    if (!descs.empty())
-        return;
-
     if (getParentEdges().size() != 2)
         IE_THROW() << errorPrefix << "has incorrect number of input edges: " << getParentEdges().size();
     if (getChildEdges().size() < (algorithm == Algorithm::AdaptivePoolingMax ? 
2 : 1)) diff --git a/src/plugins/intel_cpu/src/nodes/bin_conv.cpp b/src/plugins/intel_cpu/src/nodes/bin_conv.cpp index 90b560977b7..977c178c24d 100644 --- a/src/plugins/intel_cpu/src/nodes/bin_conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/bin_conv.cpp @@ -929,9 +929,6 @@ BinaryConvolution::BinaryConvolution(const std::shared_ptr& op, co } void BinaryConvolution::getSupportedDescriptors() { - if (!descs.empty()) - return; - withBinarization = isFusedWith(Type::FakeQuantize); withSum = false; size_t expectedInputEdgesNum = 2; @@ -1153,7 +1150,7 @@ void BinaryConvolution::setPostOps(dnnl::primitive_attr &attr) { } void BinaryConvolution::executeOptimized(const uint8_t* src, const uint8_t* weights, uint8_t* dst, - const std::vector& s_str, const std::vector& w_str, const std::vector& d_str) { + const std::vector& s_str, const std::vector& w_str, const std::vector& d_str) { auto dst_f32 = reinterpret_cast(dst); const int MB = jcp.mb; diff --git a/src/plugins/intel_cpu/src/nodes/concat.cpp b/src/plugins/intel_cpu/src/nodes/concat.cpp index 6c75ad082c5..efafb27f15d 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/concat.cpp @@ -163,7 +163,7 @@ void Concat::initSupportedPrimitiveDescriptors() { if (isDynamicNode()) { config.inConfs[i].setMemDesc(desc); } else { - config.inConfs[i].setMemDesc(desc, BLOCKED_DESC_EMPTY_MASK); + config.inConfs[i].setMemDesc(desc, BlockedMemoryDesc::EMPTY_MASK); } } supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref); @@ -197,7 +197,7 @@ void Concat::initSupportedPrimitiveDescriptors() { SizeVector strides(numOfDim); strides.back() = 1lu; size_t offset = Shape::UNDEFINED_DIM; - BlockedMemoryDesc::CmpMask mask = BLOCKED_DESC_SKIP_OFFSET_MASK; // any offset + BlockedMemoryDesc::CmpMask mask = BlockedMemoryDesc::SKIP_OFFSET_MASK; // any offset for (size_t i = 2; i <= numOfDim; i++) { if (numOfDim - i < axis) { @@ -509,7 +509,7 @@ void Concat::initOptimalPrimitiveDescriptor() { firstOutBlockingDesc->getOffsetPadding() + offset, firstOutBlockingDesc->getOffsetPaddingToData(), firstOutBlockingDesc->getStrides()), - BLOCKED_DESC_FULL_MASK); + BlockedMemoryDesc::FULL_MASK); size_t axisSize = 1; auto firstInpBlockingDesc = config.inConfs[0].getMemDesc()->as(); diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp index 31a7fa1656b..635b0e6825c 100644 --- a/src/plugins/intel_cpu/src/nodes/conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/conv.cpp @@ -324,8 +324,8 @@ InferenceEngine::Precision Convolution::fusedEltwisePrecision(const NodePtr& fus return eltwisePrecision; } -const std::vector& Convolution::getPrimitivesPriority() { - std::vector priorities = { +const std::vector& Convolution::getDefaultImplPriority() { + static const std::vector priorities = { impl_desc_type::unknown, impl_desc_type::dw_acl, impl_desc_type::winograd_acl, @@ -363,11 +363,7 @@ const std::vector& Convolution::getPrimitivesPriority() { impl_desc_type::ref, }; - for (const auto& impl : priorities) { - if (std::find(implPriorities.begin(), implPriorities.end(), impl) == implPriorities.end()) - implPriorities.push_back(impl); - } - return implPriorities; + return priorities; } const bool Convolution::isBrgConvAvailable = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core); @@ -381,10 +377,10 @@ void Convolution::getSupportedDescriptors() { attrs.reserve(2); withBiases = getOriginalInputsNumber() == 3; - if (!implPriorities.empty()) { + if (!customImplPriorities.empty()) { 
isPrimitivesPriorityDefined = true; // winograd support only constant weights and bias - isWino = std::find(implPriorities.begin(), implPriorities.end(), impl_desc_type::jit_avx512_winograd) != implPriorities.end() && + isWino = std::find(customImplPriorities.begin(), customImplPriorities.end(), impl_desc_type::jit_avx512_winograd) != customImplPriorities.end() && dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) && !canBeExecutedInInt8() && getParentEdgeAt(1)->getParent()->isConstant() && getParentEdgeAt(1)->getParent()->getType() == Type::Input && (withBiases ? (getParentEdgeAt(2)->getParent()->isConstant() && getParentEdgeAt(2)->getParent()->getType() == Type::Input) : true); @@ -709,88 +705,79 @@ void Convolution::setPostOps(dnnl::primitive_attr& attr, } void Convolution::selectOptimalPrimitiveDescriptor() { - selectPreferPrimitiveDescriptor(getPrimitivesPriority(), true); + selectPreferPrimitiveDescriptor(getImplPriority(), true); } void Convolution::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - bool containJitImpl = false; + auto getBlockedMask = [](const std::shared_ptr& memDesc, const bool isGrouped) { + if (memDesc->getType() & MemoryDescType::Blocked && !isGrouped) + return BlockedMemoryDesc::EMPTY_MASK; + return BlockedMemoryDesc::FULL_MASK; + }; + + auto addSupportedPrimitiveDescriptor = [&](const dnnl::primitive_desc& prim_desc) { + std::vector inConfs, outConfs; + const int inPlaceOutPort = withSum ? static_cast(getParentEdges().size()) - 1 : -1; + + for (size_t i = 0; i < descInputNumbers(); i++) { + auto desc = getSrcMemDesc(prim_desc, i); + + inConfs.emplace_back(desc, getBlockedMask(desc, isGrouped)); + } + + if (withDWConv) { + const std::vector dwWeightsDims{dw_conv_oc, 1, 1, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS]}; + const std::vector dwBiasesDims{dw_conv_oc}; + + const auto dwWeightsPrc = DnnlExtensionUtils::IEPrecisionToDataType(dw_conv_in_dt == dnnl_u8 ? 
Precision::I8 : Precision::FP32); + const auto dwWeightsDesc = std::make_shared(Shape(dwWeightsDims), dwWeightsPrc, memory::format_tag::Goihw8g); + inConfs.emplace_back(dwWeightsDesc); + + const auto dwBiasPrc = memory::data_type::f32; + const auto dwBiasDesc = std::make_shared(Shape(dwBiasesDims), dwBiasPrc, memory::format_tag::x); + inConfs.emplace_back(dwBiasDesc); + } + + for (size_t i = 0; i < descOutputNumbers(); i++) { + auto desc = getDstMemDesc(prim_desc, i); + + outConfs.emplace_back(desc, getBlockedMask(desc, isGrouped), inPlaceOutPort); + } + + if (withSum) { + const auto outputPrecision = outConfs.back().getMemDesc()->getPrecision(); + const auto sumDesc = getSumMemDesc(prim_desc)->cloneWithNewPrecision(outputPrecision); + inConfs.emplace_back(sumDesc); + } + + NodeConfig config(inConfs, outConfs); + const impl_desc_type impl_type = parse_impl_name(prim_desc.impl_info_str()); + + supportedPrimitiveDescriptors.emplace_back(config, impl_type); + }; for (size_t dIdx = 0; dIdx < descs.size(); dIdx++) { - const auto& desc = descs[dIdx]; + auto& desc = descs[dIdx]; + auto first_desc = dnnl::primitive_desc(DnnlExtensionUtils::clone_primitive_desc(desc.get())); - if (containJitImpl && isPossibleToSkipInitConfig(desc)) - continue; + const bool first_match = customImplPriorities.empty(); + DnnlExtensionUtils::for_each_implementation(desc, + first_match, + [&](impl_desc_type implType) { + return contains(getImplPriority(), implType); + }, + [&](dnnl::primitive_desc& desc) { + addSupportedPrimitiveDescriptor(desc); + descIdx.push_back(dIdx); + }); - auto itpd = desc; - while (itpd) { - NodeConfig config; - - for (size_t i = 0; i < descInputNumbers(); i++) { - PortConfig dataConfig; - dataConfig.inPlace(-1); - dataConfig.constant(false); - auto desc = getSrcMemDesc(itpd, i); - if (desc->getType() & MemoryDescType::Blocked && !isGrouped) { - dataConfig.setMemDesc(std::dynamic_pointer_cast(desc), BLOCKED_DESC_EMPTY_MASK); - } else { - dataConfig.setMemDesc(std::move(desc)); - } - - config.inConfs.push_back(dataConfig); - } - - if (withDWConv) { - auto weightsPrc = DnnlExtensionUtils::IEPrecisionToDataType(dw_conv_in_dt == dnnl_u8 ? 
Precision::I8 : Precision::FP32); - auto biasPrc = memory::data_type::f32; - - std::vector dwWeightsDims({dw_conv_oc, 1, 1, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS]}); - std::vector dwBiasesDims({dw_conv_oc}); - - PortConfig dataConfig; - dataConfig.inPlace(-1); - dataConfig.constant(false); - dataConfig.setMemDesc(std::make_shared(Shape(dwWeightsDims), weightsPrc, memory::format_tag::Goihw8g)); - config.inConfs.push_back(dataConfig); - - dataConfig.setMemDesc(std::make_shared(Shape(dwBiasesDims), biasPrc, memory::format_tag::x)); - config.inConfs.push_back(dataConfig); - } - - for (size_t i = 0; i < descOutputNumbers(); i++) { - PortConfig dataConfig; - if (withSum) { - dataConfig.inPlace(getParentEdges().size() - 1); - } - - dataConfig.constant(false); - auto desc = getDstMemDesc(itpd, i); - if (desc->getType() & MemoryDescType::Blocked && !isGrouped) { - dataConfig.setMemDesc(std::dynamic_pointer_cast(desc), BLOCKED_DESC_EMPTY_MASK); - } else { - dataConfig.setMemDesc(std::move(desc)); - } - - config.outConfs.push_back(dataConfig); - - if (withSum) { - dataConfig.inPlace(-1); - dataConfig.setMemDesc(getSumMemDesc(itpd)->cloneWithNewPrecision(dataConfig.getMemDesc()->getPrecision())); - config.inConfs.push_back(dataConfig); - } - } - impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str()); - if (impl_type & jit) - containJitImpl = true; - - supportedPrimitiveDescriptors.emplace_back(config, impl_type); - descIdx.push_back(dIdx); - - if (!itpd.next_impl()) - break; - } + // fallback. if none of the primitive types is present in the priority list just add first implementation + // @todo this fallback is not necessary if primitive priority list is filled correctly + if (supportedPrimitiveDescriptors.empty()) + addSupportedPrimitiveDescriptor(first_desc); } } @@ -894,7 +881,7 @@ void Convolution::createDescriptor(const std::vector& inputDesc, const auto desc = createDescriptorInternal(getEngine(), inDnnlDesc, weightDnnlDesc, biasDnnlDesc, outDnnlDesc, withBiases, stride, dilation, paddingL, paddingR, alg, attr); - if (desc.get(true)) + if (desc) descs.emplace_back(desc); } } @@ -1101,47 +1088,13 @@ void Convolution::filterSupportedDescriptors() { descs.erase(std::remove_if(descs.begin(), descs.end(), isNotSuitableDesc), descs.end()); } -bool Convolution::isPossibleToSkipInitConfig(const dnnl::primitive_desc &desc) const { - // WA: In some cases, we can predict in advance the type of primitive that will be called in the future. - // In particular, isPossibleToSkipInitConfig() checks whether we can skip the creation of primitives with - // gemm implementation, which significantly increase the network load time. - if (!inputMemoryFormatsFilter.empty() || !outputMemoryFormatsFilter.empty()) - return false; - - if (isPrimitivesPriorityDefined) - return false; - - // Here we check that we will not delete jit_planar_conv primitive by mistake. - // It requires: - // 1) strides equal 1; - // 2) not grouped; - // 3) first dim of weights is not 1. 
- bool isPossibleJitPlanar = true; - if (isGrouped || weightDims[0] != 1) - isPossibleJitPlanar = false; - - if (std::any_of(stride.begin(), stride.end(), [](const size_t s) { return s != 1; })) - isPossibleJitPlanar = false; - - auto srcMemDesc = DnnlExtensionUtils::makeDescriptor(desc.src_desc()); - auto dstMemDesc = DnnlExtensionUtils::makeDescriptor(desc.dst_desc()); - auto srcDataType = srcMemDesc->getDataType(); - auto dstDataType = dstMemDesc->getDataType(); - bool isPlanarFloatConv = srcMemDesc->hasLayoutType(LayoutType::ncsp) - && dstMemDesc->hasLayoutType(LayoutType::ncsp) - && srcDataType == memory::data_type::f32 - && dstDataType == memory::data_type::f32; - - return !isPossibleJitPlanar && isPlanarFloatConv; -} - -std::shared_ptr Convolution::getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) { +std::shared_ptr Convolution::getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { if (idx == 1) { // report original plain layout for weight since it needs to be reordered dynamically at runtime return std::make_shared(getOriginalInputPrecisionAtPort(idx), Shape(getInputShapeAtPort(idx).getStaticDims())); } - auto desc = idx > 0 ? primitive_desc_it.weights_desc(idx - 1) : primitive_desc_it.src_desc(idx); + auto desc = idx > 0 ? prim_desc.weights_desc(idx - 1) : prim_desc.src_desc(idx); if (getInputShapeAtPort(idx).isDynamic()) { return DnnlExtensionUtils::makeUndefinedDesc(desc, getInputShapeAtPort(idx)); } @@ -1399,6 +1352,7 @@ void Convolution::prepareParams() { key.attr); const bool found = DnnlExtensionUtils::find_implementation(prim_desc, key.implType); + if (found) { return std::make_shared( prim_desc, @@ -1572,7 +1526,7 @@ void Convolution::redefineOutputMemory(const std::vector &newOutputS Node::redefineOutputMemory(newOutputShapes); } -MemoryDescPtr Convolution::getSumMemDesc(primitive_desc_iterator &primitive_desc_it) { +MemoryDescPtr Convolution::getSumMemDesc(const primitive_desc &primitive_desc_it) { if (getOutputShapeAtPort(0).isDynamic()) { return DnnlExtensionUtils::makeUndefinedDesc(primitive_desc_it.dst_desc(0), getOutputShapeAtPort(0)); } diff --git a/src/plugins/intel_cpu/src/nodes/conv.h b/src/plugins/intel_cpu/src/nodes/conv.h index 0b5ab483cc8..4b87d13c4e9 100644 --- a/src/plugins/intel_cpu/src/nodes/conv.h +++ b/src/plugins/intel_cpu/src/nodes/conv.h @@ -34,7 +34,7 @@ public: return false; } InferenceEngine::Precision getRuntimePrecision() const override; - std::shared_ptr getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override; + std::shared_ptr getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const override; dnnl::memory getWeights() const; dnnl::memory getBias() const; @@ -73,7 +73,7 @@ protected: InferenceEngine::Precision fusedEltwisePrecision(const NodePtr& fusingNode) const; void redefineOutputMemory(const std::vector &newOutputShapes) override; void addFusedNode(const NodePtr &fusingNode) override; - const std::vector& getPrimitivesPriority() override; + const std::vector& getDefaultImplPriority() override; private: enum class zpType { @@ -105,12 +105,11 @@ private: void setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims, bool useLegacyPostOps, bool initWeights = false); void SetPostOpsAndZeroPoints(std::vector &attrs); void filterSupportedDescriptors(); - bool isPossibleToSkipInitConfig(const dnnl::primitive_desc &desc) const; bool isNspcAvailable() const; InferenceEngine::Blob::Ptr createInternalBlob(InferenceEngine::SizeVector dims, size_t edgeNum, 
bool isGrouped = false); void updatePadding(); - MemoryDescPtr getSumMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it); + MemoryDescPtr getSumMemDesc(const dnnl::primitive_desc &primitive_desc_it); MemoryPtr getOutputMemory() const; VectorDims makeInputDummyShape(const Shape& inpShape) const; VectorDims outputStaticShape() const; diff --git a/src/plugins/intel_cpu/src/nodes/deconv.cpp b/src/plugins/intel_cpu/src/nodes/deconv.cpp index f1ed2e450c1..684c01dd0fc 100644 --- a/src/plugins/intel_cpu/src/nodes/deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/deconv.cpp @@ -1077,7 +1077,7 @@ void Deconvolution::createDescriptor(const std::vector &inputDesc } } -std::shared_ptr Deconvolution::getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) { +std::shared_ptr Deconvolution::getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { if (idx == 2 && !withBiases) { return std::make_shared(InferenceEngine::Precision::I32, Shape(getInputShapeAtPort(2).getStaticDims())); } else if (idx > 0 && isInt8) { @@ -1086,15 +1086,15 @@ std::shared_ptr Deconvolution::getSrcMemDesc(dnnl::primitive_desc_it return std::make_shared(getOriginalInputPrecisionAtPort(idx), Shape(getInputShapeAtPort(idx).getStaticDims())); } - auto desc = idx > 0 ? primitive_desc_it.weights_desc(idx - 1) : isInt8 ? primitive_desc_it.src_desc(idx) : primitive_desc_it.diff_dst_desc(idx); + auto desc = idx > 0 ? prim_desc.weights_desc(idx - 1) : isInt8 ? prim_desc.src_desc(idx) : prim_desc.diff_dst_desc(idx); if (getInputShapeAtPort(idx).isDynamic()) { return DnnlExtensionUtils::makeUndefinedDesc(desc, getInputShapeAtPort(idx)); } return DnnlExtensionUtils::makeDescriptor(desc); } -std::shared_ptr Deconvolution::getDstMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) { - auto desc = isInt8 ? primitive_desc_it.dst_desc(idx) : primitive_desc_it.diff_src_desc(idx); +std::shared_ptr Deconvolution::getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { + auto desc = isInt8 ? 
prim_desc.dst_desc(idx) : prim_desc.diff_src_desc(idx); if (getOutputShapeAtPort(idx).isDynamic()) { return DnnlExtensionUtils::makeUndefinedDesc(desc, getOutputShapeAtPort(idx)); } diff --git a/src/plugins/intel_cpu/src/nodes/deconv.h b/src/plugins/intel_cpu/src/nodes/deconv.h index ea3024dfb12..0c24c966b49 100644 --- a/src/plugins/intel_cpu/src/nodes/deconv.h +++ b/src/plugins/intel_cpu/src/nodes/deconv.h @@ -34,8 +34,8 @@ public: return static_cast(getParentEdges().size()); } - std::shared_ptr getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override; - std::shared_ptr getDstMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override; + std::shared_ptr getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const override; + std::shared_ptr getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const override; InferenceEngine::Precision getRuntimePrecision() const override; diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp index c4ab140804e..746e560e44b 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp @@ -2049,7 +2049,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() { NodeConfig config; for (size_t i = 0; i < getParentEdges().size(); i++) { - BlockedMemoryDesc::CmpMask inputMask = BLOCKED_DESC_SKIP_OFFSET_MASK; + BlockedMemoryDesc::CmpMask inputMask = BlockedMemoryDesc::SKIP_OFFSET_MASK; PortConfig portConfig; // TODO [DS]: inplace if (!isDynamicNode()) @@ -2070,7 +2070,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() { portConfig.constant(false); const auto &dstShape = getOutputShapeAtPort(0); - BlockedMemoryDesc::CmpMask outputMask = BLOCKED_DESC_SKIP_OFFSET_MASK; + BlockedMemoryDesc::CmpMask outputMask = BlockedMemoryDesc::SKIP_OFFSET_MASK; if (!isDynamicNode() && dstShape.getDims()[0] == 1) { outputMask.reset(0); // accepts any stride on the batch axis } @@ -2091,7 +2091,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() { } auto factory = std::make_shared(eltwiseAttrs, srcMemoryDescs, dstMemoryDescs, - std::make_shared(context, getPrimitivesPriority())); + std::make_shared(context, getImplPriority())); return {config, impl_type, !factory->isEmpty() ? 
factory : nullptr}; } else { @@ -2332,7 +2332,7 @@ bool Eltwise::needPrepareParams() const { } void Eltwise::selectOptimalPrimitiveDescriptor() { - selectPreferPrimitiveDescriptor(getPrimitivesPriority(), true); + selectPreferPrimitiveDescriptor(getImplPriority(), true); } void Eltwise::execute(dnnl::stream strm) { diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor.hpp index 4b61f32fa77..6b920dcd753 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor.hpp @@ -48,12 +48,11 @@ public: typedef std::shared_ptr Ptr; typedef std::shared_ptr CPtr; - ExecutorContext(const GraphContext::CPtr graphContext, const std::vector& implPriorities) { - this->runtimeCache = graphContext->getParamsCache(); - this->scratchPad = graphContext->getScratchPad(); - this->engine = graphContext->getEngine(); - this->implPriorities = implPriorities; - } + ExecutorContext(const GraphContext::CPtr graphContext, const std::vector& implPriorities) + : runtimeCache(graphContext->getParamsCache()), + scratchPad(graphContext->getScratchPad()), + engine(graphContext->getEngine()), + implPriorities(implPriorities) {} MultiCacheWeakPtr getRuntimeCache() const { return runtimeCache; @@ -75,9 +74,9 @@ private: // weak_ptr is required to avoid cycle dependencies with MultiCache // since ExecutorContext is stored in Executor itself MultiCacheWeakPtr runtimeCache; - DnnlScratchPadPtr scratchPad = nullptr; + DnnlScratchPadPtr scratchPad; dnnl::engine engine; - std::vector implPriorities = {}; + std::vector implPriorities; }; class ExecutorFactory { @@ -92,4 +91,4 @@ using ExecutorFactoryPtr = std::shared_ptr; using ExecutorFactoryCPtr = std::shared_ptr; } // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/eye.cpp b/src/plugins/intel_cpu/src/nodes/eye.cpp index 01c08fd164d..1332942655e 100644 --- a/src/plugins/intel_cpu/src/nodes/eye.cpp +++ b/src/plugins/intel_cpu/src/nodes/eye.cpp @@ -62,8 +62,6 @@ Eye::Eye(const std::shared_ptr& op, const GraphContext::CPtr context) } void Eye::getSupportedDescriptors() { - if (!descs.empty()) - return; if (!one_of(getParentEdges().size(), 3u, 4u)) THROW_ERROR << errorPrefix << "has incorrect number of input edges: " << getParentEdges().size(); if (getChildEdges().empty()) diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index 2ec558f5740..d9f9e1dc874 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -8,6 +8,7 @@ #include "input.h" #include "fake_quantize.h" #include "input.h" +#include "memory_desc/blocked_memory_desc.h" #include "reorder.h" #include "transformations/cpu_opset/common/op/fully_connected.hpp" #include "ngraph/opsets/opset1.hpp" @@ -505,44 +506,40 @@ bool FullyConnected::created() const { return getType() == Type::FullyConnected; } -const std::vector& FullyConnected::getPrimitivesPriority() { - std::vector priorities = { - impl_desc_type::unknown, - impl_desc_type::acl, - impl_desc_type::brgemm_sparse_avx512_amx, - impl_desc_type::brgemm_avx512_amx, - impl_desc_type::brgemm_avx512, - impl_desc_type::gemm_blas, - impl_desc_type::gemm_avx512, - impl_desc_type::gemm_avx2, - impl_desc_type::gemm_avx, - impl_desc_type::gemm_sse42, - impl_desc_type::gemm_any, - impl_desc_type::gemm, - impl_desc_type::jit_gemm, - 
impl_desc_type::jit_uni_dw, - impl_desc_type::jit_uni_1x1, - impl_desc_type::jit_uni, - impl_desc_type::jit_avx512_dw, - impl_desc_type::jit_avx512_1x1, - impl_desc_type::jit_avx512, - impl_desc_type::jit_avx2_dw, - impl_desc_type::jit_avx2_1x1, - impl_desc_type::jit_avx2, - impl_desc_type::jit_avx_dw, - impl_desc_type::jit_avx_1x1, - impl_desc_type::jit_avx, - impl_desc_type::jit_sse42_dw, - impl_desc_type::jit_sse42_1x1, - impl_desc_type::jit_sse42, - impl_desc_type::ref, +const std::vector& FullyConnected::getDefaultImplPriority() { + static const std::vector priorities = { + impl_desc_type::unknown, + impl_desc_type::acl, + impl_desc_type::brgemm_sparse_avx512_amx, + impl_desc_type::brgemm_avx512_amx, + impl_desc_type::brgemm_avx512, + impl_desc_type::gemm_blas, + impl_desc_type::gemm_avx512, + impl_desc_type::gemm_avx2, + impl_desc_type::gemm_avx, + impl_desc_type::gemm_sse42, + impl_desc_type::gemm_any, + impl_desc_type::gemm, + impl_desc_type::jit_gemm, + impl_desc_type::jit_uni_dw, + impl_desc_type::jit_uni_1x1, + impl_desc_type::jit_uni, + impl_desc_type::jit_avx512_dw, + impl_desc_type::jit_avx512_1x1, + impl_desc_type::jit_avx512, + impl_desc_type::jit_avx2_dw, + impl_desc_type::jit_avx2_1x1, + impl_desc_type::jit_avx2, + impl_desc_type::jit_avx_dw, + impl_desc_type::jit_avx_1x1, + impl_desc_type::jit_avx, + impl_desc_type::jit_sse42_dw, + impl_desc_type::jit_sse42_1x1, + impl_desc_type::jit_sse42, + impl_desc_type::ref, }; - for (const auto& impl : priorities) { - if (std::find(implPriorities.begin(), implPriorities.end(), impl) == implPriorities.end()) - implPriorities.push_back(impl); - } - return implPriorities; + return priorities; } // WA: creation DnnlMemoryDesc with format == any is prohibited @@ -639,53 +636,60 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - for (auto& desc : descs) { - primitive_desc_iterator itpd = desc; - while (static_cast(itpd)) { - // 3D FC requires implicit reshape so strides should be defined - auto supportsUndefStridesAndOffset = [&]() { - return getOutputShapeAtPort(0).getRank() == 2; - }; + // 3D FC requires implicit reshape so strides should be defined + auto supportsUndefStridesAndOffset = [&]() { + return getOutputShapeAtPort(0).getRank() == 2; + }; - NodeConfig config; - for (size_t i = 0; i < descInputNumbers(); i++) { - PortConfig portConfig; - portConfig.inPlace(-1); - portConfig.constant(false); - auto desc = getSrcMemDesc(itpd, i); - if (supportsUndefStridesAndOffset() && !(i == WEIGHTS_ID && useSparseWeights)) { - portConfig.setMemDesc(std::dynamic_pointer_cast(desc), BLOCKED_DESC_EMPTY_MASK); - } else { - portConfig.setMemDesc(desc); - } - config.inConfs.push_back(portConfig); - } + auto addSupportedPrimitiveDescriptor = [&](const dnnl::primitive_desc& prim_desc) { + std::vector inConfs, outConfs; + const int inPlaceOutPort = canBeInPlace() ? 0 : -1; - for (size_t i = 0; i < descOutputNumbers(); i++) { - PortConfig portConfig; - portConfig.inPlace(canBeInPlace() ? 0 : -1); - portConfig.constant(false); - auto desc = getDstMemDesc(itpd, i); - if (supportsUndefStridesAndOffset()) { - portConfig.setMemDesc(std::dynamic_pointer_cast(desc), BLOCKED_DESC_EMPTY_MASK); - } else { - portConfig.setMemDesc(desc); - } - config.outConfs.push_back(portConfig); - } + for (size_t i = 0; i < descInputNumbers(); i++) { + auto desc = getSrcMemDesc(prim_desc, i); + const auto inputBlockedMask = (supportsUndefStridesAndOffset() && !(i == WEIGHTS_ID && useSparseWeights)) ? 
+ BlockedMemoryDesc::EMPTY_MASK : + BlockedMemoryDesc::FULL_MASK; - impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str()); - - supportedPrimitiveDescriptors.emplace_back(config, impl_type); - - if (!itpd.next_impl()) - break; + inConfs.emplace_back(desc, inputBlockedMask); } + + const auto outputBlockedMask = supportsUndefStridesAndOffset() ? BlockedMemoryDesc::EMPTY_MASK : BlockedMemoryDesc::FULL_MASK; + + for (size_t i = 0; i < descOutputNumbers(); i++) { + auto desc = getDstMemDesc(prim_desc, i); + + outConfs.emplace_back(desc, outputBlockedMask, inPlaceOutPort); + } + + const NodeConfig config(inConfs, outConfs); + const impl_desc_type impl_type = parse_impl_name(prim_desc.impl_info_str()); + + supportedPrimitiveDescriptors.emplace_back(config, impl_type); + }; + + for (auto& desc : descs) { + auto first_desc = dnnl::primitive_desc(DnnlExtensionUtils::clone_primitive_desc(desc.get())); + + const bool first_match = customImplPriorities.empty(); + DnnlExtensionUtils::for_each_implementation(desc, + first_match, + [&](impl_desc_type implType) { + return contains(getImplPriority(), implType); + }, + [&](dnnl::primitive_desc& desc) { + addSupportedPrimitiveDescriptor(desc); + }); + + // fallback. if none of the primitive types is present in the priority list just add first implementation + // @todo this fallback is not necessary if primitive priority list is filled correctly + if (supportedPrimitiveDescriptors.empty()) + addSupportedPrimitiveDescriptor(first_desc); } } -std::shared_ptr FullyConnected::getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) { - auto desc = idx > 0 ? primitive_desc_it.weights_desc(idx - 1) : primitive_desc_it.src_desc(idx); +std::shared_ptr FullyConnected::getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { + auto desc = idx > 0 ? prim_desc.weights_desc(idx - 1) : prim_desc.src_desc(idx); if (getInputShapeAtPort(idx).getRank() == 3) { return std::make_shared( @@ -699,8 +703,8 @@ std::shared_ptr FullyConnected::getSrcMemDesc(dnnl::primitive_desc_i return DnnlExtensionUtils::makeDescriptor(desc); } -std::shared_ptr FullyConnected::getDstMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) { - auto desc = primitive_desc_it.dst_desc(idx); +std::shared_ptr FullyConnected::getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { + auto desc = prim_desc.dst_desc(idx); if (getOutputShapeAtPort(idx).getRank() == 3) { return std::make_shared( diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h index e661154c640..59ea9a9db60 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h @@ -33,7 +33,7 @@ public: return getOutputShapeAtPort(0).getRank() == 3 ? 
2 : 1; } - const std::vector& getPrimitivesPriority() override; + const std::vector& getDefaultImplPriority() override; void createDescriptor(const std::vector& inputDesc, const std::vector& outputDesc) override; @@ -44,8 +44,8 @@ public: void initSupportedPrimitiveDescriptors() override; void initOptimalPrimitiveDescriptor() override; void createPrimitive() override; - std::shared_ptr getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override; - std::shared_ptr getDstMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override; + std::shared_ptr getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const override; + std::shared_ptr getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const override; InferenceEngine::Precision getRuntimePrecision() const override; diff --git a/src/plugins/intel_cpu/src/nodes/interpolate.cpp b/src/plugins/intel_cpu/src/nodes/interpolate.cpp index 195066dd061..1ffa599fc7c 100644 --- a/src/plugins/intel_cpu/src/nodes/interpolate.cpp +++ b/src/plugins/intel_cpu/src/nodes/interpolate.cpp @@ -2096,7 +2096,7 @@ void Interpolate::initSupportedPrimitiveDescriptors() { } auto factory = std::make_shared(interpAttrs, srcMemoryDescs, dstMemoryDescs, - std::make_shared(context, getPrimitivesPriority())); + std::make_shared(context, getImplPriority())); if (!factory->isEmpty()) { supportedPrimitiveDescriptors.push_back({config, implDetail, factory}); } diff --git a/src/plugins/intel_cpu/src/nodes/lrn.cpp b/src/plugins/intel_cpu/src/nodes/lrn.cpp index ba5ca8366b2..dfe704dfff0 100644 --- a/src/plugins/intel_cpu/src/nodes/lrn.cpp +++ b/src/plugins/intel_cpu/src/nodes/lrn.cpp @@ -151,14 +151,14 @@ void Lrn::getSupportedDescriptors() { } } -std::shared_ptr Lrn::getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) { +std::shared_ptr Lrn::getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { if (idx > 0) { return std::make_shared(getOriginalInputPrecisionAtPort(idx), getInputShapeAtPort(idx)); } else { if (getInputShapeAtPort(idx).isDynamic()) { - return DnnlExtensionUtils::makeUndefinedDesc(primitive_desc_it.src_desc(idx), getInputShapeAtPort(idx)); + return DnnlExtensionUtils::makeUndefinedDesc(prim_desc.src_desc(idx), getInputShapeAtPort(idx)); } - return DnnlExtensionUtils::makeDescriptor(primitive_desc_it.src_desc(idx)); + return DnnlExtensionUtils::makeDescriptor(prim_desc.src_desc(idx)); } } diff --git a/src/plugins/intel_cpu/src/nodes/lrn.h b/src/plugins/intel_cpu/src/nodes/lrn.h index c1635261f70..84492cb53ef 100644 --- a/src/plugins/intel_cpu/src/nodes/lrn.h +++ b/src/plugins/intel_cpu/src/nodes/lrn.h @@ -25,7 +25,7 @@ public: size_t descInputNumbers() override { return static_cast(getOriginalInputsNumber()); } - std::shared_ptr getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override; + std::shared_ptr getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const override; bool created() const override; bool canBeInPlace() const override { return false; diff --git a/src/plugins/intel_cpu/src/nodes/matmul.cpp b/src/plugins/intel_cpu/src/nodes/matmul.cpp index 7d12ecd4ec4..172d914bf07 100644 --- a/src/plugins/intel_cpu/src/nodes/matmul.cpp +++ b/src/plugins/intel_cpu/src/nodes/matmul.cpp @@ -506,39 +506,50 @@ void MatMul::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - for (auto& desc : descs) { - auto itpd = desc; - while (itpd) { - NodeConfig config; - for (size_t i = 0; i < 
descInputNumbers(); i++) {
-                PortConfig portConfig;
-                portConfig.inPlace(-1);
-                portConfig.constant(false);
-                portConfig.setMemDesc(getSrcMemDesc(itpd, i));
+    auto addSupportedPrimitiveDescriptor = [&](const dnnl::primitive_desc& prim_desc) {
+        std::vector inConfs, outConfs;
+        const int inPlaceOutPort = canBeInPlace() ? 0 : -1;
 
-                config.inConfs.push_back(portConfig);
-            }
+        for (size_t i = 0; i < descInputNumbers(); i++) {
+            auto desc = getSrcMemDesc(prim_desc, i);
 
-            for (size_t i = 0; i < descOutputNumbers(); i++) {
-                PortConfig portConfig;
-                portConfig.inPlace(canBeInPlace() ? 0 : -1);
-                portConfig.constant(false);
-                portConfig.setMemDesc(getDstMemDesc(itpd, i));
-
-                config.outConfs.push_back(portConfig);
-            }
-
-            impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
-
-            supportedPrimitiveDescriptors.emplace_back(config, impl_type);
-            if (!itpd.next_impl())
-                break;
+            inConfs.emplace_back(desc);
         }
-    }
+
+        for (size_t i = 0; i < descOutputNumbers(); i++) {
+            auto desc = getDstMemDesc(prim_desc, i);
+
+            outConfs.emplace_back(desc, BlockedMemoryDesc::FULL_MASK, inPlaceOutPort);
+        }
+
+        const NodeConfig config(inConfs, outConfs);
+        const impl_desc_type impl_type = parse_impl_name(prim_desc.impl_info_str());
+
+        supportedPrimitiveDescriptors.emplace_back(config, impl_type);
+    };
+
+    for (auto& desc : descs) {
+        auto first_desc = dnnl::primitive_desc(DnnlExtensionUtils::clone_primitive_desc(desc.get()));
+
+        const bool first_match = customImplPriorities.empty();
+        DnnlExtensionUtils::for_each_implementation(desc,
+                                                    first_match,
+                                                    [&](impl_desc_type implType) {
+                                                        return contains(getImplPriority(), implType);
+                                                    },
+                                                    [&](dnnl::primitive_desc& desc) {
+                                                        addSupportedPrimitiveDescriptor(desc);
+                                                    });
+
+        // Fallback: if none of the primitive types is present in the priority list, just add the first implementation
+        // @todo this fallback is not necessary if the primitive priority list is filled correctly
+        if (supportedPrimitiveDescriptors.empty())
+            addSupportedPrimitiveDescriptor(first_desc);
+    }
 }
 
-MemoryDescPtr MatMul::getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) {
-    auto desc = idx > 0 ? primitive_desc_it.weights_desc(idx - 1): primitive_desc_it.src_desc(idx);
+MemoryDescPtr MatMul::getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const {
+    auto desc = idx > 0 ?
prim_desc.weights_desc(idx - 1): prim_desc.src_desc(idx); if (idx < 2) // inputs return std::make_shared( @@ -679,42 +690,39 @@ void MatMul::executeDynamicImpl(dnnl::stream strm) { execute(strm); } -const std::vector& MatMul::getPrimitivesPriority() { - std::vector priorities = { - impl_desc_type::unknown, - impl_desc_type::brgemm_avx512_amx, - impl_desc_type::brgemm_avx512, - impl_desc_type::gemm_acl, - impl_desc_type::gemm_blas, - impl_desc_type::gemm_avx512, - impl_desc_type::gemm_avx2, - impl_desc_type::gemm_avx, - impl_desc_type::gemm_sse42, - impl_desc_type::gemm_any, - impl_desc_type::gemm, - impl_desc_type::jit_gemm, - impl_desc_type::jit_uni_dw, - impl_desc_type::jit_uni_1x1, - impl_desc_type::jit_uni, - impl_desc_type::jit_avx512_dw, - impl_desc_type::jit_avx512_1x1, - impl_desc_type::jit_avx512, - impl_desc_type::jit_avx2_dw, - impl_desc_type::jit_avx2_1x1, - impl_desc_type::jit_avx2, - impl_desc_type::jit_avx_dw, - impl_desc_type::jit_avx_1x1, - impl_desc_type::jit_avx, - impl_desc_type::jit_sse42_dw, - impl_desc_type::jit_sse42_1x1, - impl_desc_type::jit_sse42, - impl_desc_type::ref, +const std::vector& MatMul::getDefaultImplPriority() { + static const std::vector priorities = { + impl_desc_type::unknown, + impl_desc_type::brgemm_avx512_amx, + impl_desc_type::brgemm_avx512, + impl_desc_type::gemm_acl, + impl_desc_type::gemm_blas, + impl_desc_type::gemm_avx512, + impl_desc_type::gemm_avx2, + impl_desc_type::gemm_avx, + impl_desc_type::gemm_sse42, + impl_desc_type::gemm_any, + impl_desc_type::gemm, + impl_desc_type::jit_gemm, + impl_desc_type::jit_uni_dw, + impl_desc_type::jit_uni_1x1, + impl_desc_type::jit_uni, + impl_desc_type::jit_avx512_dw, + impl_desc_type::jit_avx512_1x1, + impl_desc_type::jit_avx512, + impl_desc_type::jit_avx2_dw, + impl_desc_type::jit_avx2_1x1, + impl_desc_type::jit_avx2, + impl_desc_type::jit_avx_dw, + impl_desc_type::jit_avx_1x1, + impl_desc_type::jit_avx, + impl_desc_type::jit_sse42_dw, + impl_desc_type::jit_sse42_1x1, + impl_desc_type::jit_sse42, + impl_desc_type::ref, }; - for (const auto& impl : priorities) { - if (std::find(implPriorities.begin(), implPriorities.end(), impl) == implPriorities.end()) - implPriorities.push_back(impl); - } - return implPriorities; + + return priorities; } } // namespace node } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/matmul.h b/src/plugins/intel_cpu/src/nodes/matmul.h index ed4ec55dfff..dae1730452a 100644 --- a/src/plugins/intel_cpu/src/nodes/matmul.h +++ b/src/plugins/intel_cpu/src/nodes/matmul.h @@ -24,7 +24,7 @@ public: void createDescriptor(const std::vector& inputDesc, const std::vector& outputDesc) override; void initSupportedPrimitiveDescriptors() override; - MemoryDescPtr getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override; + MemoryDescPtr getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const override; bool canFuse(const NodePtr& node) const override; bool created() const override; @@ -42,7 +42,7 @@ public: void executeDynamicImpl(dnnl::stream strm) override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; - const std::vector& getPrimitivesPriority() override; + const std::vector& getDefaultImplPriority() override; protected: AttrPtr initPrimitiveAttr() override; diff --git a/src/plugins/intel_cpu/src/nodes/mvn.cpp b/src/plugins/intel_cpu/src/nodes/mvn.cpp index ac479a0042f..cf5ce1a520a 100644 --- a/src/plugins/intel_cpu/src/nodes/mvn.cpp +++ b/src/plugins/intel_cpu/src/nodes/mvn.cpp 
@@ -1219,7 +1219,7 @@ void MVN::initSupportedPrimitiveDescriptors() {
         }
         auto factory = std::make_shared(mvnAttrs, srcMemoryDescs, dstMemoryDescs,
-                                        std::make_shared(context, getPrimitivesPriority()));
+                                        std::make_shared(context, getImplPriority()));
         if (!factory->isEmpty()) {
             supportedPrimitiveDescriptors.push_back({config, impl_type, factory});
         }
diff --git a/src/plugins/intel_cpu/src/nodes/node_config.h b/src/plugins/intel_cpu/src/nodes/node_config.h
index abf6b145ea8..257f7ffdb64 100644
--- a/src/plugins/intel_cpu/src/nodes/node_config.h
+++ b/src/plugins/intel_cpu/src/nodes/node_config.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include 
 #include "memory_desc/cpu_memory_desc.h"
 #include "memory_desc/blocked_memory_desc.h"
 
@@ -77,29 +78,30 @@ public:
 
 private:
     BlockedMemoryDescPtr _memDesc;
-    CmpMask _cmpMask = BLOCKED_DESC_FULL_MASK;
+    CmpMask _cmpMask = BlockedMemoryDesc::FULL_MASK;
 };
 
 class PortConfig {
 public:
     PortConfig() = default;
 
-    PortConfig(const PortConfig& rhs) {
-        this->_constant = rhs._constant;
-        this->_inPlacePort = rhs._inPlacePort;
-        if (rhs._desc) {
-            this->_desc = rhs._desc;
-        }
-    }
+    PortConfig(MemoryDescPtr desc,
+               BlockedMemoryDesc::CmpMask cmpMask = BlockedMemoryDesc::FULL_MASK,
+               int inPlacePort = -1,
+               bool isConstant = false)
+        : _desc(createPortDesc(desc, cmpMask)),
+          _inPlacePort(inPlacePort),
+          _constant(isConstant) {}
 
-    PortConfig& operator=(const PortConfig& rhs) {
-        this->_constant = rhs._constant;
-        this->_inPlacePort = rhs._inPlacePort;
-        if (rhs._desc) {
-            this->_desc = rhs._desc;
-        }
-        return *this;
-    }
+    // prevent implicit conversion of cmpMask
+    PortConfig(MemoryDescPtr desc,
+               int cmpMask,
+               int inPlacePort = -1,
+               bool isConstant = false) = delete;
+
+    PortConfig(const PortConfig& rhs) = default;
+
+    PortConfig& operator=(const PortConfig& rhs) = default;
 
     PortConfig(PortConfig&& rhs) = default;
     PortConfig& operator=(PortConfig&& rhs) = default;
@@ -124,29 +126,42 @@ public:
         return _desc->getMemDesc();
     }
 
-    void setMemDesc(MemoryDescPtr desc) {
-        if (desc->getType() & Blocked) {
-            setMemDesc(std::dynamic_pointer_cast(desc), BLOCKED_DESC_FULL_MASK);
-        } else {
-            _desc = std::make_shared(desc);
-        }
-    }
-
-    void setMemDesc(BlockedMemoryDescPtr desc, BlockedMemoryDesc::CmpMask cmpMask) {
-        _desc = std::make_shared(desc, cmpMask);
-    }
-
     PortDescBasePtr getPortDesc() const {
         return _desc;
     }
 
+    void setMemDesc(MemoryDescPtr desc) {
+        _desc = createPortDesc(desc, BlockedMemoryDesc::FULL_MASK);
+    }
+
+    void setMemDesc(BlockedMemoryDescPtr desc, BlockedMemoryDesc::CmpMask cmpMask) {
+        _desc = createPortDesc(desc, cmpMask);
+    }
+
 private:
-    bool _constant = false;
-    int _inPlacePort = -1;
+    PortDescBasePtr createPortDesc(MemoryDescPtr desc, BlockedMemoryDesc::CmpMask cmpMask) {
+        if (desc->getType() & Blocked)
+            return createPortDesc(std::dynamic_pointer_cast(desc), cmpMask);
+
+        return std::make_shared(desc);
+    }
+
+    PortDescBasePtr createPortDesc(BlockedMemoryDescPtr desc, BlockedMemoryDesc::CmpMask cmpMask) {
+        return std::make_shared(desc, cmpMask);
+    }
+
     PortDescBasePtr _desc;
+    int _inPlacePort = -1;
+    bool _constant = false;
 };
 
 struct NodeConfig {
+    NodeConfig() = default;
+
+    NodeConfig(std::vector inConfs, std::vector outConfs)
+        : inConfs(std::move(inConfs)), outConfs(std::move(outConfs))
+    {}
+
     std::vector inConfs;
     std::vector outConfs;
 };
diff --git a/src/plugins/intel_cpu/src/nodes/non_zero.cpp b/src/plugins/intel_cpu/src/nodes/non_zero.cpp
index 7704fb7da28..1abbedaa259 100644
--- a/src/plugins/intel_cpu/src/nodes/non_zero.cpp
+++ 
b/src/plugins/intel_cpu/src/nodes/non_zero.cpp @@ -46,8 +46,6 @@ NonZero::NonZero(const std::shared_ptr& op, const GraphContext::CP } void NonZero::getSupportedDescriptors() { - if (!descs.empty()) - return; if (getParentEdges().size() != 1) IE_THROW() << errorPrefix << "has incorrect number of input edges: " << getParentEdges().size(); if (!getChildEdges().size()) diff --git a/src/plugins/intel_cpu/src/nodes/pooling.cpp b/src/plugins/intel_cpu/src/nodes/pooling.cpp index 97958bb3de6..3509d09a079 100644 --- a/src/plugins/intel_cpu/src/nodes/pooling.cpp +++ b/src/plugins/intel_cpu/src/nodes/pooling.cpp @@ -16,6 +16,7 @@ #include #include #include "memory_desc/dnnl_blocked_memory_desc.h" +#include "nodes/node_config.h" #include // to access and change C pooling primitive desc internal padding field @@ -576,16 +577,15 @@ void Pooling::createDescriptor(const std::vector &inputDesc, const auto& out_candidate = dnnlOutDesc.getDnnlDesc(); auto desc = createDescriptorInternal(in_candidate, out_candidate, getPoolingAlgorithm()); - descs.emplace_back(desc); + + if (desc) + descs.emplace_back(desc); } void Pooling::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - dnnl::primitive_attr attr; - setPostOps(attr); - if (useACL) { auto& creatorsMap = BlockedDescCreator::getCommonCreators(); auto pushDesc = [&](LayoutType format) { @@ -599,65 +599,74 @@ void Pooling::initSupportedPrimitiveDescriptors() { creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(0))); std::vector srcMemoryDescs; - for (size_t i = 0; i < config.inConfs.size(); i++) { - srcMemoryDescs.push_back(config.inConfs[i].getMemDesc()); + for (const auto& inConf : config.inConfs) { + srcMemoryDescs.push_back(inConf.getMemDesc()); } std::vector dstMemoryDescs; - for (size_t i = 0; i < config.outConfs.size(); i++) { - dstMemoryDescs.push_back(config.outConfs[i].getMemDesc()); + for (const auto& outConf : config.outConfs) { + dstMemoryDescs.push_back(outConf.getMemDesc()); } auto factory = std::make_shared( poolingAttrs, srcMemoryDescs, dstMemoryDescs, - std::make_shared(context, getPrimitivesPriority())); + std::make_shared(context, getImplPriority())); supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::undef, factory); }; + pushDesc(LayoutType::ncsp); - } else { - for (auto& desc : descs) { - auto itpd = desc; - while (static_cast(itpd)) { - NodeConfig config; - for (size_t i = 0; i < descInputNumbers(); i++) { - PortConfig dataConfig; - dataConfig.inPlace(-1); - dataConfig.constant(false); - dataConfig.setMemDesc(getSrcMemDesc(itpd, i)); + return; + } - config.inConfs.push_back(dataConfig); - } + auto addSupportedPrimitiveDescriptor = [&](const dnnl::primitive_desc& prim_desc) { + std::vector inConfs, outConfs; + const int inPlaceOutPort = canBeInPlace() ? 0 : -1; - for (size_t i = 0; i < descOutputNumbers(); i++) { - PortConfig dataConfig; - dataConfig.inPlace(canBeInPlace() ? 
0 : -1);
-                    dataConfig.constant(false);
-                    dataConfig.setMemDesc(getDstMemDesc(itpd, i));
-
-                    config.outConfs.push_back(dataConfig);
-                }
-
-                // CPU plugin doesn't support second output of MaxPool-8, but anyway we should have out config for second port as stub
-                if (isMaxPool8) {
-                    auto& creatorsMap = BlockedDescCreator::getCommonCreators();
-                    PortConfig dataConfig;
-                    dataConfig.inPlace(-1);
-                    dataConfig.constant(false);
-                    dataConfig.setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(config.outConfs.front().getMemDesc()->getPrecision(),
-                                                                                             getOutputShapeAtPort(1)));
-
-                    config.outConfs.push_back(dataConfig);
-                }
-
-                impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
-
-                supportedPrimitiveDescriptors.emplace_back(config, impl_type);
-                if (!itpd.next_impl())
-                    break;
-            }
+        for (size_t i = 0; i < descInputNumbers(); i++) {
+            auto desc = getSrcMemDesc(prim_desc, i);
+            inConfs.emplace_back(desc);
         }
+
+        for (size_t i = 0; i < descOutputNumbers(); i++) {
+            auto desc = getDstMemDesc(prim_desc, i);
+            outConfs.emplace_back(desc, BlockedMemoryDesc::FULL_MASK, inPlaceOutPort);
+        }
+
+        // CPU plugin doesn't support the second output of MaxPool-8, so an out config for the second port is added as a stub
+        if (isMaxPool8) {
+            const auto& creatorsMap = BlockedDescCreator::getCommonCreators();
+            const auto outputPrecision = outConfs.front().getMemDesc()->getPrecision();
+            auto desc = creatorsMap.at(LayoutType::ncsp)->createSharedDesc(outputPrecision, getOutputShapeAtPort(1));
+
+            outConfs.emplace_back(desc);
+        }
+
+        const NodeConfig config(inConfs, outConfs);
+        const impl_desc_type impl_type = parse_impl_name(prim_desc.impl_info_str());
+
+        supportedPrimitiveDescriptors.emplace_back(config, impl_type);
+    };
+
+    for (auto& desc : descs) {
+        auto first_desc = dnnl::primitive_desc(DnnlExtensionUtils::clone_primitive_desc(desc.get()));
+
+        const bool first_match = customImplPriorities.empty();
+        DnnlExtensionUtils::for_each_implementation(desc,
+                                                    first_match,
+                                                    [&](impl_desc_type implType) {
+                                                        return contains(getImplPriority(), implType);
+                                                    },
+                                                    [&](dnnl::primitive_desc& desc) {
+                                                        addSupportedPrimitiveDescriptor(desc);
+                                                    });
+
+        // Fallback: if none of the primitive types is present in the priority list, just add the first implementation
+        // @todo this fallback is not necessary if the primitive priority list is filled correctly
+        if (supportedPrimitiveDescriptors.empty())
+            addSupportedPrimitiveDescriptor(first_desc);
+    }
 }
diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp
index 1164227643c..ce52820fd1b 100644
--- a/src/plugins/intel_cpu/src/nodes/reduce.cpp
+++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp
@@ -1771,9 +1771,6 @@ Reduce::Reduce(const std::shared_ptr& op, const GraphContext::CPtr
 }
 
 void Reduce::getSupportedDescriptors() {
-    if (!descs.empty())
-        return;
-
     if (getParentEdges().size() != 2)
         IE_THROW() << errorPrefix << " gets incorrect number of input edges!";
     if (getChildEdges().empty())
@@ -1858,7 +1855,7 @@ void Reduce::initSupportedPrimitiveDescriptors() {
         }
         auto factory = std::make_shared(reduceAttrs, srcMemoryDescs, dstMemoryDescs,
-                                        std::make_shared(context, getPrimitivesPriority()));
+                                        std::make_shared(context, getImplPriority()));
         if (!factory->isEmpty()) {
             supportedPrimitiveDescriptors.push_back({config, impl_type, factory});
         }
@@ -3190,4 +3187,4 @@ bool Reduce::created() const {
 } // namespace node
 } // namespace intel_cpu
-} // namespace ov
\ No newline at end of file
+} // namespace ov
diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp
index 6e3bc53c02c..ef2d4ca4d67 100644
--- a/src/plugins/intel_cpu/src/nodes/reorder.cpp
+++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp
@@ -253,9 +253,10 @@ void Reorder::createReorderPrimitive(const dnnl::memory::desc& srcDesc,
 #endif
 }
 
-const std::vector& Reorder::getPrimitivesPriority() {
-    implPriorities = {impl_desc_type::reorder};
-    return implPriorities;
+const std::vector& Reorder::getDefaultImplPriority() {
+    static const std::vector priorities = {impl_desc_type::reorder};
+
+    return priorities;
 }
 
 bool Reorder::created() const {
diff --git a/src/plugins/intel_cpu/src/nodes/reorder.h b/src/plugins/intel_cpu/src/nodes/reorder.h
index 4b105ad46c1..f9517912855 100644
--- a/src/plugins/intel_cpu/src/nodes/reorder.h
+++ b/src/plugins/intel_cpu/src/nodes/reorder.h
@@ -24,7 +24,7 @@ public:
     void initSupportedPrimitiveDescriptors() override;
     void execute(dnnl::stream strm) override;
     bool created() const override;
-    const std::vector& getPrimitivesPriority() override;
+    const std::vector& getDefaultImplPriority() override;
 
     bool isExecutable() const override;
 
diff --git a/src/plugins/intel_cpu/src/nodes/rnn.cpp b/src/plugins/intel_cpu/src/nodes/rnn.cpp
index 7b7f9980c2e..308f39efaff 100644
--- a/src/plugins/intel_cpu/src/nodes/rnn.cpp
+++ b/src/plugins/intel_cpu/src/nodes/rnn.cpp
@@ -956,7 +956,7 @@ void RNN::fillDescs() {
                 wDescs,
                 *attr);
 
-    descs.push_back(desc);
+    descs.emplace_back(desc);
 }
 
 void RNN::createDescriptor(const std::vector &inputDesc,
@@ -1109,11 +1109,13 @@ void RNN::prepareParams() {
         primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
 }
 
-std::shared_ptr RNN::getSrcMemDesc(dnnl::primitive_desc_iterator& primitive_desc_it, size_t idx) {
+std::shared_ptr RNN::getSrcMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const {
+    (void) prim_desc;
     return supportedPrimitiveDescriptors[0].getConfig().inConfs[idx].getMemDesc();
 }
 
-std::shared_ptr RNN::getDstMemDesc(dnnl::primitive_desc_iterator& primitive_desc_it, size_t idx) {
+std::shared_ptr RNN::getDstMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const {
+    (void) prim_desc;
     return 
supportedPrimitiveDescriptors[0].getConfig().outConfs[idx].getMemDesc(); } diff --git a/src/plugins/intel_cpu/src/nodes/rnn.h b/src/plugins/intel_cpu/src/nodes/rnn.h index 5e324a1bf2d..d16bcd10c50 100644 --- a/src/plugins/intel_cpu/src/nodes/rnn.h +++ b/src/plugins/intel_cpu/src/nodes/rnn.h @@ -25,8 +25,8 @@ public: static bool isCell(const std::shared_ptr& op); static bool testNativeOrder(const std::shared_ptr& op); void getSupportedDescriptors() override; - std::shared_ptr getSrcMemDesc(dnnl::primitive_desc_iterator& primitive_desc_it, size_t idx) override; - std::shared_ptr getDstMemDesc(dnnl::primitive_desc_iterator& primitive_desc_it, size_t idx) override; + std::shared_ptr getSrcMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const override; + std::shared_ptr getDstMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const override; bool created() const override; void createDescriptor(const std::vector& inputDesc, const std::vector& outputDesc) override; diff --git a/src/plugins/intel_cpu/src/nodes/roi_align.cpp b/src/plugins/intel_cpu/src/nodes/roi_align.cpp index 04a2d1f7953..02431e43ec9 100644 --- a/src/plugins/intel_cpu/src/nodes/roi_align.cpp +++ b/src/plugins/intel_cpu/src/nodes/roi_align.cpp @@ -705,9 +705,6 @@ ROIAlign::ROIAlign(const std::shared_ptr& op, const GraphContext:: } void ROIAlign::getSupportedDescriptors() { - if (!descs.empty()) - return; - if (getParentEdges().size() != 3) IE_THROW() << errorPrefix << "has incorrect number of input edges: " << getParentEdges().size(); if (getChildEdges().empty()) diff --git a/src/plugins/intel_cpu/src/nodes/roi_pooling.cpp b/src/plugins/intel_cpu/src/nodes/roi_pooling.cpp index b082635f2a3..a91123fb256 100644 --- a/src/plugins/intel_cpu/src/nodes/roi_pooling.cpp +++ b/src/plugins/intel_cpu/src/nodes/roi_pooling.cpp @@ -407,9 +407,6 @@ ROIPooling::ROIPooling(const std::shared_ptr& op, const GraphConte } void ROIPooling::getSupportedDescriptors() { - if (!descs.empty()) - return; - if (getParentEdges().size() != 2) IE_THROW() << errorPrefix << "has incorrect number of input edges: " << getParentEdges().size(); if (getChildEdges().empty()) diff --git a/src/plugins/intel_cpu/src/nodes/shapeof.cpp b/src/plugins/intel_cpu/src/nodes/shapeof.cpp index 601b83c7b06..6c4d76b41c7 100644 --- a/src/plugins/intel_cpu/src/nodes/shapeof.cpp +++ b/src/plugins/intel_cpu/src/nodes/shapeof.cpp @@ -68,8 +68,6 @@ ShapeOf::ShapeOf(const std::shared_ptr& op, const GraphContext::CP } void ShapeOf::getSupportedDescriptors() { - if (!descs.empty()) - return; if (getParentEdges().size() != 1) IE_THROW() << errorPrefix << "has incorrect number of input edges: " << getParentEdges().size(); if (getChildEdges().empty()) diff --git a/src/plugins/intel_cpu/src/nodes/softmax.cpp b/src/plugins/intel_cpu/src/nodes/softmax.cpp index 3238116815b..91e9cc610a4 100644 --- a/src/plugins/intel_cpu/src/nodes/softmax.cpp +++ b/src/plugins/intel_cpu/src/nodes/softmax.cpp @@ -124,7 +124,7 @@ void SoftMax::initOptimalPrimitiveDescriptor() { auto config = selected_pd->getConfig(); if (isDynamicNode()) { auto outMemDesc = config.outConfs[0].getMemDesc(); - config.outConfs[0].setMemDesc(std::dynamic_pointer_cast(outMemDesc), BLOCKED_DESC_FULL_MASK); + config.outConfs[0].setMemDesc(std::dynamic_pointer_cast(outMemDesc), BlockedMemoryDesc::FULL_MASK); } else { if (config.inConfs.size() != 1 || config.outConfs.size() != 1 || (config.inConfs[0].getMemDesc()->isDefined() && @@ -155,7 +155,8 @@ void SoftMax::createDescriptor(const std::vector &inputDesc, *attr, true); - 
descs.push_back(desc); + if (desc) + descs.emplace_back(desc); } void SoftMax::prepareParams() { diff --git a/src/plugins/intel_cpu/src/nodes/split.cpp b/src/plugins/intel_cpu/src/nodes/split.cpp index 1476c373ab2..fe4cb99b753 100644 --- a/src/plugins/intel_cpu/src/nodes/split.cpp +++ b/src/plugins/intel_cpu/src/nodes/split.cpp @@ -178,7 +178,7 @@ void Split::initSupportedPrimitiveDescriptors() { SizeVector strides(numOfDim); strides.back() = 1lu; size_t offset = Shape::UNDEFINED_DIM; - BlockedMemoryDesc::CmpMask mask = BLOCKED_DESC_SKIP_OFFSET_MASK; // accepts any offset + BlockedMemoryDesc::CmpMask mask = BlockedMemoryDesc::SKIP_OFFSET_MASK; // accepts any offset for (size_t i = 2; i <= numOfDim; i++) { if (numOfDim - i < axis) { @@ -363,14 +363,16 @@ void Split::initOptimalPrimitiveDescriptor() { auto outBlockingDesc = oldDesc->as(); const auto& shape = outBlockingDesc->getShape(); const auto& blkDims = outBlockingDesc->getBlockDims(); - config.outConfs[i].setMemDesc(std::make_shared(outBlockingDesc->getPrecision(), - shape, - blkDims, - outBlockingDesc->getOrder(), - firstInBlockingDesc->getOffsetPadding() + offset, - firstInBlockingDesc->getOffsetPaddingToData(), - (shape.hasZeroDims() ? VectorDims(blkDims.size(), 0) : - firstInBlockingDesc->getStrides())), BLOCKED_DESC_FULL_MASK); + config.outConfs[i].setMemDesc(std::make_shared( + outBlockingDesc->getPrecision(), + shape, + blkDims, + outBlockingDesc->getOrder(), + firstInBlockingDesc->getOffsetPadding() + offset, + firstInBlockingDesc->getOffsetPaddingToData(), + (shape.hasZeroDims() ? VectorDims(blkDims.size(), 0) : + firstInBlockingDesc->getStrides())), + BlockedMemoryDesc::FULL_MASK); size_t axisSize = 1; for (size_t j = axis; j < outBlockingDesc->getBlockDims().size(); j++) { @@ -398,7 +400,7 @@ void Split::selectOptimalPrimitiveDescriptor() { // Enforce the reference implementation for the planar layout if the implementation is in the impl priorities list. // This is needed mostly for the testing purposes, since for the planar layout Split works always in place, we need to enforce // the reference implementation when it is selected in a test to test that piece of code. - if (!implPriorities.empty() && implPriorities[0] == impl_desc_type::ref) { + if (!customImplPriorities.empty() && customImplPriorities[0] == impl_desc_type::ref) { for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); ++i) { auto& pd = supportedPrimitiveDescriptors[i]; if (pd.getConfig().inConfs[0].getMemDesc()->hasLayoutType(LayoutType::ncsp) && diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 044f1dbea73..68c05f12b38 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -191,7 +191,7 @@ void Snippet::initSupportedPrimitiveDescriptors() { const auto equalPrecisions = getOriginalOutputPrecisions().size() == 1 && precision == getOriginalOutputPrecisionAtPort(0); - BlockedMemoryDesc::CmpMask inputMask = BLOCKED_DESC_SKIP_OFFSET_MASK; + BlockedMemoryDesc::CmpMask inputMask = BlockedMemoryDesc::SKIP_OFFSET_MASK; PortConfig portConfig; portConfig.inPlace((!i && canBeInPlace() && equalPrecisions) ? 
0 : -1); portConfig.constant(false); @@ -207,7 +207,7 @@ void Snippet::initSupportedPrimitiveDescriptors() { if (supportedPrecisions.count(precision) == 0) IE_THROW() << "Subgraph node with name `" << getName() << "` doesn't support " << precision << " precision."; - BlockedMemoryDesc::CmpMask outputMask = BLOCKED_DESC_SKIP_OFFSET_MASK; + BlockedMemoryDesc::CmpMask outputMask = BlockedMemoryDesc::SKIP_OFFSET_MASK; PortConfig portConfig; portConfig.inPlace(-1); portConfig.constant(false); @@ -235,7 +235,7 @@ void Snippet::initSupportedPrimitiveDescriptors() { } void Snippet::selectOptimalPrimitiveDescriptor() { - selectPreferPrimitiveDescriptor(getPrimitivesPriority(), true); + selectPreferPrimitiveDescriptor(getImplPriority(), true); } InferenceEngine::Precision Snippet::getRuntimePrecision() const { std::vector inputPrecisions; diff --git a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp index 8f33b058b9a..b14768462f7 100644 --- a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp +++ b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp @@ -3,6 +3,7 @@ // #include "iml_type_mapper.h" +#include namespace ov { namespace intel_cpu { @@ -122,5 +123,9 @@ const char* impl_type_to_string(impl_desc_type type) { return "unknown"; } +bool contains(const std::vector& priorities, const impl_desc_type impl_type_str) { + return std::find(priorities.begin(), priorities.end(), impl_type_str) != priorities.end(); +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h index d91b6c6e139..7176abf38e3 100644 --- a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h +++ b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h @@ -5,6 +5,7 @@ #pragma once #include +#include namespace ov { namespace intel_cpu { @@ -101,6 +102,7 @@ enum impl_desc_type { const char * impl_type_to_string(impl_desc_type type); impl_desc_type parse_impl_name(std::string impl_desc_name); +bool contains(const std::vector& priorities, const impl_desc_type impl_type_str); } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/utils/debug_caps_config.h b/src/plugins/intel_cpu/src/utils/debug_caps_config.h index 64ceaa3b740..e2b7c4cacc2 100644 --- a/src/plugins/intel_cpu/src/utils/debug_caps_config.h +++ b/src/plugins/intel_cpu/src/utils/debug_caps_config.h @@ -9,6 +9,7 @@ #include #include +#include namespace ov { namespace intel_cpu { @@ -79,13 +80,13 @@ public: }; struct PropertyGroup { - virtual std::vector getPropertySetters(void) = 0; + virtual std::vector getPropertySetters() = 0; void parseAndSet(const std::string& str) { const auto& options = ov::util::split(str, ' '); const auto& propertySetters = getPropertySetters(); bool failed = false; - auto getHelp = [propertySetters] (void) { + auto getHelp = [propertySetters]() { std::string help; for (const auto& property : propertySetters) help.append('\t' + property->getPropertyName() + "=<" + property->getPropertyValueDescription() + ">\n"); @@ -118,7 +119,7 @@ public: struct : PropertyGroup { TransformationFilter transformations; - std::vector getPropertySetters(void) override { + std::vector getPropertySetters() override { return { transformations.getPropertySetter() }; } } disable; @@ -128,7 +129,7 @@ public: IrFormatFilter format = { 1 << IrFormatFilter::Xml }; TransformationFilter transformations; - std::vector getPropertySetters(void) override { + std::vector getPropertySetters() 
override { return { PropertySetterPtr(new StringPropertySetter("dir", dir, "path to dumped IRs")), format.getPropertySetter(), transformations.getPropertySetter() }; @@ -138,23 +139,29 @@ public: private: struct PropertySetter { virtual bool parseAndSet(const std::string& str) = 0; - virtual std::string getPropertyValueDescription(void) const = 0; + virtual std::string getPropertyValueDescription() const = 0; - PropertySetter(const std::string&& name) : propertyName(name) {} - const std::string& getPropertyName(void) const { return propertyName; } + PropertySetter(std::string name) : propertyName(std::move(name)) {} + + virtual ~PropertySetter() = default; + + const std::string& getPropertyName() const { return propertyName; } private: const std::string propertyName; }; struct StringPropertySetter : PropertySetter { - StringPropertySetter(const std::string&& name, std::string& ref, const std::string&& valueDescription) - : PropertySetter(std::move(name)), property(ref), propertyValueDescription(valueDescription) {} + StringPropertySetter(const std::string& name, std::string& ref, const std::string&& valueDescription) + : PropertySetter(name), property(ref), propertyValueDescription(valueDescription) {} + + ~StringPropertySetter() override = default; + bool parseAndSet(const std::string& str) override { property = str; return true; } - std::string getPropertyValueDescription(void) const override { return propertyValueDescription; } + std::string getPropertyValueDescription() const override { return propertyValueDescription; } private: std::string& property; @@ -168,8 +175,11 @@ private: std::vector bits; }; - BitsetFilterPropertySetter(const std::string&& name, std::bitset& ref, const std::vector&& tokens) - : PropertySetter(std::move(name)), property(ref), propertyTokens(tokens) {} + BitsetFilterPropertySetter(const std::string& name, std::bitset& ref, const std::vector&& tokens) + : PropertySetter(name), property(ref), propertyTokens(tokens) {} + + ~BitsetFilterPropertySetter() override = default; + bool parseAndSet(const std::string& str) override { const auto& tokens = str.empty() ? 
std::vector{"all"} :
+                                                 ov::util::split(ov::util::to_lower(str), ',');
@@ -188,7 +198,7 @@ private:
             }
             return true;
         }
-        std::string getPropertyValueDescription(void) const override {
+        std::string getPropertyValueDescription() const override {
             std::string supportedTokens = "comma separated filter tokens: ";
             for (size_t i = 0; i < propertyTokens.size(); i++) {
                 if (i)
diff --git a/src/plugins/intel_cpu/src/utils/ngraph_utils.hpp b/src/plugins/intel_cpu/src/utils/ngraph_utils.hpp
index 7d9dbf24cbd..b9743971e20 100644
--- a/src/plugins/intel_cpu/src/utils/ngraph_utils.hpp
+++ b/src/plugins/intel_cpu/src/utils/ngraph_utils.hpp
@@ -19,7 +19,7 @@ inline std::string getRTInfoValue(const std::map& rtInfo,
     }
 }
 
-inline std::string getPrimitivesPriorityValue(const std::shared_ptr &node) {
+inline std::string getImplPriorityValue(const std::shared_ptr &node) {
     const auto &rtInfo = node->get_rt_info();
     auto it_info = rtInfo.find(ov::PrimitivesPriority::get_type_info_static());
 
diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/convolution.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/convolution.cpp
index 44dc9771e8e..32b7c00fc2e 100755
--- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/convolution.cpp
+++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/convolution.cpp
@@ -507,6 +507,23 @@ INSTANTIATE_TEST_SUITE_P(smoke_Conv_1D_GEMM_FP32, ConvolutionLayerCPUTest,
                              ::testing::Values(cpuEmptyPluginConfig)),
                          ConvolutionLayerCPUTest::getTestCaseName);
 
+// Verify that even if a primitive is missing from the custom priority list there is still a fallback to the default priority list
+const auto conv_gemm_1D_improperPriorityList = CPUSpecificParams{{ncw}, {ncw}, {"unknown"}, "jit_gemm"};
+
+INSTANTIATE_TEST_SUITE_P(smoke_Conv_1D_GEMM_FP32_ImproperPriorityList, ConvolutionLayerCPUTest,
+                         ::testing::Combine(
+                             ::testing::Combine(
+                                 convParams_ExplicitPadding_GEMM_1D,
+                                 ::testing::Values(ElementType::f32),
+                                 ::testing::Values(ElementType::undefined),
+                                 ::testing::Values(ElementType::undefined),
+                                 ::testing::ValuesIn(inShapesGemm1D),
+                                 ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                             ::testing::ValuesIn(filterCPUInfoForDevice({conv_gemm_1D_improperPriorityList})),
+                             ::testing::Values(emptyFusingSpec),
+                             ::testing::Values(cpuEmptyPluginConfig)),
+                         ConvolutionLayerCPUTest::getTestCaseName);
+
 INSTANTIATE_TEST_SUITE_P(smoke_Conv_1D_GEMM_BF16, ConvolutionLayerCPUTest,
                          ::testing::Combine(
                              ::testing::Combine(
diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/group_convolution.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/group_convolution.cpp
index e94cab5c537..8324f4452da 100644
--- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/group_convolution.cpp
+++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/group_convolution.cpp
@@ -1914,11 +1914,11 @@ INSTANTIATE_TEST_SUITE_P(smoke_JIT_AVX512_DW_GroupConv, GroupConvolutionLayerCPU
 
 /* ============= brgemm GroupConvolution test, expect fallback to other implementation ============= */
 const std::vector CPUParams_Fallback_Brgemm_2D = {
-    conv_avx512_2D_nspc_brgconv,
-    conv_avx512_2D_nspc_brgconv_amx
+    CPUSpecificParams{{nhwc}, {nhwc}, {/* non-brgconv_avx512 is expected */}, "brgconv_avx512"},
+    CPUSpecificParams{{nhwc}, {nhwc}, {/* non-brgconv_avx512_amx is expected */}, "brgconv_avx512_amx"},
 };
 const std::vector CPUParams_Fallback_Brgemm_1D_Small_Shape = {
-    conv_avx512_1D_nspc_brgconv_amx
+    CPUSpecificParams{{nwc}, {nwc}, {/* non-brgconv_avx512_amx is expected */},
"brgconv_avx512_amx"} }; const std::vector BRGEMM_EXPECT_FALLBACK_GroupConvTestCases = generateSingleGroupConvCPUTestCases( // channel <= 16 diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/matmul.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/matmul.cpp index ba14e14d2c8..6d03d82af75 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/matmul.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/matmul.cpp @@ -251,7 +251,7 @@ std::vector filterSpecificParams_BrgemmAmx() { std::vector filterSpecificParams_Brgconv1x1() { std::vector specificParams; if (with_cpu_x86_avx512_core()) { - specificParams.push_back(CPUSpecificParams{{}, {}, {"brgconv_avx512_1x1"}, "brgconv_avx512_1x1"}); + specificParams.push_back(CPUSpecificParams{{}, {}, {/* brgconv_avx512_1x1 is not a part of fc impl list */}, "brgconv_avx512_1x1"}); } return specificParams; diff --git a/src/plugins/intel_cpu/tests/functional/test_utils/convolution_params.hpp b/src/plugins/intel_cpu/tests/functional/test_utils/convolution_params.hpp index 2585f42e867..b220edf4deb 100644 --- a/src/plugins/intel_cpu/tests/functional/test_utils/convolution_params.hpp +++ b/src/plugins/intel_cpu/tests/functional/test_utils/convolution_params.hpp @@ -11,9 +11,9 @@ namespace CPUTestUtils { const auto conv_ref_2D = CPUSpecificParams{{nchw}, {nchw}, {"ref_any"}, "ref_any"}; const auto conv_ref_3D = CPUSpecificParams{{ncdhw}, {ncdhw}, {"ref_any"}, "ref_any"}; - const auto conv_gemm_1D = CPUSpecificParams{{ncw}, {ncw}, {"gemm_any"}, "jit_gemm"}; - const auto conv_gemm_2D = CPUSpecificParams{{nchw}, {nchw}, {"gemm_any"}, "jit_gemm"}; - const auto conv_gemm_3D = CPUSpecificParams{{ncdhw}, {ncdhw}, {"gemm_any"}, "jit_gemm"}; + const auto conv_gemm_1D = CPUSpecificParams{{ncw}, {ncw}, {"jit_gemm"}, "jit_gemm"}; + const auto conv_gemm_2D = CPUSpecificParams{{nchw}, {nchw}, {"jit_gemm"}, "jit_gemm"}; + const auto conv_gemm_3D = CPUSpecificParams{{ncdhw}, {ncdhw}, {"jit_gemm"}, "jit_gemm"}; const auto conv_gemm_1D_nspc = CPUSpecificParams{{nwc}, {nwc}, {"jit_gemm"}, "jit_gemm"}; const auto conv_gemm_2D_nspc = CPUSpecificParams{{nhwc}, {nhwc}, {"jit_gemm"}, "jit_gemm"};