[CPU] Plugin migration on ngraph (#4344)

Gorokhov Dmitriy 2021-05-06 19:49:24 +03:00 committed by GitHub
parent 2bb8e9facc
commit a19413c0c0
411 changed files with 14378 additions and 51599 deletions

View File

@ -54,21 +54,16 @@ if(SELECTIVE_BUILD STREQUAL "ON")
endif()
endif()
target_link_libraries(${TARGET_NAME} PRIVATE mkldnn inference_engine inference_engine_legacy
inference_engine_transformations inference_engine_lp_transformations)
target_link_libraries(${TARGET_NAME} PRIVATE mkldnn
inference_engine
inference_engine_transformations
inference_engine_lp_transformations)
target_include_directories(${TARGET_NAME} PRIVATE
$<TARGET_PROPERTY:mkldnn,INCLUDE_DIRECTORIES>)
# Cross compiled function
# TODO: The same for proposal, proposalONNX, topk
cross_compiled_file(${TARGET_NAME}
ARCH AVX512F AVX2 SSE42 ANY
nodes/argmax_imp.cpp
API nodes/argmax_imp.hpp
NAME arg_max_execute
NAMESPACE InferenceEngine::Extensions::Cpu::XARCH
)
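# Note (descriptive, not from this diff): cross_compiled_file() builds the listed source
# once per ISA named under ARCH and dispatches arg_max_execute (declared in the API header)
# to the best variant available on the target CPU at run time.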
cross_compiled_file(${TARGET_NAME}
ARCH AVX2 ANY
nodes/proposal_imp.cpp
@ -85,7 +80,6 @@ add_library(${TARGET_NAME}_obj OBJECT ${SOURCES} ${HEADERS})
target_link_libraries(${TARGET_NAME}_obj PUBLIC mkldnn)
target_include_directories(${TARGET_NAME}_obj PRIVATE $<TARGET_PROPERTY:inference_engine_preproc_s,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:inference_engine_legacy,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:inference_engine_transformations,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:openvino::itt,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:inference_engine_lp_transformations,INTERFACE_INCLUDE_DIRECTORIES>

View File

@ -0,0 +1,124 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
namespace MKLDNNPlugin {
enum Algorithm {
Undefined,
// Pooling algorithms
PoolingMax,
PoolingAvg,
// Convolution algorithms
ConvolutionCommon,
ConvolutionGrouped,
// Deconvolution algorithms
DeconvolutionCommon,
DeconvolutionGrouped,
// Elementwise algorithms
EltwiseAdd,
EltwiseMultiply,
EltwiseSubtract,
EltwiseDivide,
EltwiseFloorMod,
EltwiseMod,
EltwiseMaximum,
EltwiseMinimum,
EltwiseSquaredDifference,
EltwisePowerDynamic,
EltwisePowerStatic,
EltwiseMulAdd,
EltwiseEqual,
EltwiseNotEqual,
EltwiseGreater,
EltwiseGreaterEqual,
EltwiseLess,
EltwiseLessEqual,
EltwiseLogicalAnd,
EltwiseLogicalOr,
EltwiseLogicalXor,
EltwiseLogicalNot,
EltwiseRelu,
EltwiseGelu,
EltwiseElu,
EltwiseTanh,
EltwiseSigmoid,
EltwiseAbs,
EltwiseSqrt,
EltwiseSoftRelu,
EltwiseExp,
EltwiseClamp,
EltwiseSwish,
EltwisePrelu,
EltwiseMish,
EltwiseHswish,
EltwiseHsigmoid,
EltwiseRoundHalfToEven,
EltwiseRoundHalfAwayFromZero,
EltwiseErf,
// FakeQuantize algorithms
FQCommon,
FQQuantization,
FQBinarization,
// ROIPooling algorithms
ROIPoolingMax,
ROIPoolingBilinear,
// ROIAlign algorithms
ROIAlignMax,
ROIAlignAvg,
// PSROIPooling algorithms
PSROIPoolingAverage,
PSROIPoolingBilinear,
PSROIPoolingBilinearDeformable,
// Reduce algorithms
ReduceL1,
ReduceL2,
ReduceAnd,
ReduceOr,
ReduceMax,
ReduceMean,
ReduceMin,
ReduceProd,
ReduceSum,
ReduceLogSum,
ReduceLogSumExp,
ReduceSumSquare,
// Math algorithms
MathAbs,
MathAcos,
MathAcosh,
MathAsin,
MathAsinh,
MathAtan,
MathAtanh,
MathCeiling,
MathCos,
MathCosh,
MathErf,
MathFloor,
MathHardSigmoid,
MathLog,
MathNegative,
MathReciprocal,
MathSelu,
MathSign,
MathSin,
MathSinh,
MathSoftPlus,
MathSoftsign,
MathTan
};
} // namespace MKLDNNPlugin
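// A minimal usage sketch (illustration only, not part of this header): a node could map
// its ngraph operation onto one of the Algorithm values above, e.g. for pooling.
// The helper name below is hypothetical.
#include <ngraph/opsets/opset1.hpp>
#include <memory>

namespace MKLDNNPlugin {
inline Algorithm poolingAlgorithmFor(const std::shared_ptr<const ngraph::Node>& op) {
    if (ngraph::is_type<ngraph::opset1::MaxPool>(op))
        return PoolingMax;
    if (ngraph::is_type<ngraph::opset1::AvgPool>(op))
        return PoolingAvg;
    return Undefined;
}
}  // namespace MKLDNNPlugin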

View File

@ -4,9 +4,8 @@
#include "jit_eltwise_emitters.hpp"
#include <cpu/x64/jit_uni_eltwise.hpp>
#include "legacy/ie_layers.h"
#include <ngraph/opsets/opset1.hpp>
#include <nodes/mkldnn_eltwise_node.h>
using namespace InferenceEngine;
using namespace mkldnn::impl::utils;
@ -1305,15 +1304,16 @@ jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_
prepare_table();
}
jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
auto *powerLayer = dynamic_cast<InferenceEngine::PowerLayer *>(node->getCnnLayer().get());
if (powerLayer == nullptr)
IE_THROW() << "Cannot convert power layer.";
power = powerLayer->power;
scale = powerLayer->scale;
shift = powerLayer->offset;
const MKLDNNEltwiseNode *powerNode = dynamic_cast<const MKLDNNEltwiseNode *>(node);
if (powerNode == nullptr) {
IE_THROW() << "Can't cast to MKLDNNEltwiseNode";
}
power = powerNode->getAlpha();
scale = powerNode->getBeta();
shift = powerNode->getGamma();
prepare_table();
}

View File

@ -4,7 +4,6 @@
#include "jit_emitter.hpp"
#include "jit_load_store_emitters.hpp"
#include "legacy/ie_layers.h"
#include <cpu/x64/jit_generator.hpp>
#include "utils/bfloat16.hpp"

View File

@ -25,7 +25,7 @@ jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa,
jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
auto eltwiseNode = dynamic_cast<const MKLDNNEltwiseNode*>(node);
kind = static_cast<mkldnn_alg_kind_t>(eltwiseNode->getAlgorithm());
kind = static_cast<mkldnn_alg_kind_t>(eltwiseNode->getMKLDNNAlgorithm());
alpha = eltwiseNode->getAlpha();
beta = eltwiseNode->getBeta();
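// Note: after the migration, getAlgorithm() returns the plugin-level Algorithm enum
// (see the new header above), while getMKLDNNAlgorithm() returns the mkldnn::algorithm
// used to create the primitive, which is what the emitter needs for the cast above.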

View File

@ -23,18 +23,6 @@ size_t MKLDNNDescriptor::outputNumbers() const {
return 1;
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::batch_normalization_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::batch_normalization_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::batch_normalization_forward::desc>() {
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::batch_normalization_forward::desc>>(desc);
if (typeDesc == nullptr) {
IE_THROW() << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::convolution_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::convolution_forward::desc>(desc));
}

View File

@ -10,9 +10,6 @@
class MKLDNNDescriptor {
public:
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::batch_normalization_forward::desc> desc);
operator std::shared_ptr<mkldnn::batch_normalization_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::convolution_forward::desc> desc);
operator std::shared_ptr<mkldnn::convolution_forward::desc>();

View File

@ -6,6 +6,7 @@
#include "mkldnn_node.h"
#include "mkldnn_extension_utils.h"
#include <blob_factory.hpp>
#include "utils/cpu_utils.hpp"
using namespace mkldnn;
namespace MKLDNNPlugin {
@ -603,7 +604,7 @@ InferenceEngine::Blob::Ptr MKLDNNEdge::getBlob() {
else
desc = InferenceEngine::TensorDesc(desc.getPrecision(), dims.ToSizeVector(), desc.getBlockingDesc());
return make_blob_with_precision(desc, memoryPtr->GetData());
return isEmptyTensorDesc(desc) ? make_blob_with_precision(desc) : make_blob_with_precision(desc, memoryPtr->GetData());
}
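// A plausible sketch of the isEmptyTensorDesc() helper pulled in from "utils/cpu_utils.hpp"
// (an assumption for illustration, not quoted from the diff): a descriptor counts as empty
// when any of its dimensions is zero, in which case no memory pointer is attached to the blob.
#include <algorithm>
inline bool isEmptyTensorDesc(const InferenceEngine::TensorDesc& td) {
    const auto& dims = td.getDims();
    return std::any_of(dims.begin(), dims.end(), [](size_t dim) { return dim == 0; });
}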
void MKLDNNEdge::sharedMemFrom(const MKLDNNEdgePtr &edge) {

View File

@ -4,7 +4,6 @@
#include <ie_metric_helpers.hpp>
#include <precision_utils.h>
#include <legacy/net_pass.h>
#include "mkldnn_exec_network.h"
#include "mkldnn_async_infer_request.h"
@ -12,8 +11,6 @@
#include "mkldnn_memory_state.h"
#include "mkldnn_itt.h"
#include "nodes/mkldnn_memory_node.hpp"
#include <legacy/ie_util_internal.hpp>
#include <legacy/graph_tools.hpp>
#include <threading/ie_executor_manager.hpp>
#include <threading/ie_cpu_streams_executor.hpp>
@ -23,7 +20,8 @@
#include <unordered_set>
#include <utility>
#include <cstring>
#include <legacy/details/ie_cnn_network_tools.h>
#include <ngraph/opsets/opset1.hpp>
#include <transformations/utils/utils.hpp>
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
@ -43,189 +41,17 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::CNNNetwork &network,
extensionManager(extMgr),
_cfg{cfg},
_name{network.getName()},
_numaNodesWeights(numaNodesWeights) {
OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, MKLDNNPlugin::itt::domains::MKLDNN_LT, "MKLDNNExecNetwork", "cloneNet");
// we clone the network if we have statistics and we can transform the network.
_clonedNetwork = cloneNetwork(network);
bool isFloatModel = true;
if (_cfg.lpTransformsMode == Config::LPTransformsMode::On) {
// Check if network is INT8 or Binary.
CNNNetworkIterator iter(network);
while (iter != CNNNetworkIterator()) {
if (CaselessEq<std::string>()((*iter)->type, "FakeQuantize")) {
isFloatModel = false;
break;
}
iter++;
}
auto changePrecisionBF16 = [&](Precision current, Precision target) {
InputsDataMap inputs = _clonedNetwork.getInputsInfo();
OutputsDataMap outputs = _clonedNetwork.getOutputsInfo();
CNNNetworkIterator iter(_clonedNetwork);
while (iter != CNNNetworkIterator()) {
// check, if memory output node needs to be transformed
if (current == Precision::FP32 &&
(*iter)->type == "Memory" && (*iter)->outData.size() == 0 &&
(*iter)->insData[0].lock()->getPrecision() == current) {
(*iter)->insData[0].lock()->setPrecision(target);
}
for (size_t o = 0; o < (*iter)->outData.size(); o++) {
if (inputs.find((*iter)->outData[o]->getName()) == inputs.end()
&& outputs.find((*iter)->outData[o]->getName()) == outputs.end()
&& !CaselessEq<std::string>()((*iter)->type, "const")
&& (*iter)->outData[o]->getPrecision() == current) {
(*iter)->outData[o]->setPrecision(target);
}
}
iter++;
}
};
if (with_cpu_x86_avx512_core()) {
// If the enforceBF16 flag was set, the BF16 transformation applies to all layers supported by the CPU plugin.
// Otherwise, only layers marked as BF16 in '_clonedNetwork' will be executed in bfloat16 mode.
// The CPU plugin throws an exception if layers marked as BF16 are not supported by the CPU plugin.
// BF16 + INT8 or BF16 + BIN models will be executed in mixed precision only if the
// enforceBF16 flag was set manually
if (isFloatModel == false) {
if (cfg.manualEnforceBF16 == true)
changePrecisionBF16(Precision::FP32, Precision::BF16);
} else if (cfg.enforceBF16 == true) {
changePrecisionBF16(Precision::FP32, Precision::BF16);
}
} else {
changePrecisionBF16(Precision::BF16, Precision::FP32);
}
_numaNodesWeights(numaNodesWeights),
_network(network) {
auto function = network.getFunction();
if (function == nullptr) {
IE_THROW() << "CPU plug-in doesn't support not ngraph-based model!";
}
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "createConstInputs");
auto createConstInputTo = [&](CNNLayerPtr layer, Blob::Ptr blob, const std::vector<size_t>& shape, const std::string& name) {
LayerParams attrs = {layer->name + "_const_" + name, "Const", blob->getTensorDesc().getPrecision()};
auto constLayer = std::make_shared<InferenceEngine::CNNLayer>(attrs);
constLayer->blobs["custom"] = blob;
const TensorDesc& td = {blob->getTensorDesc().getPrecision(), shape, TensorDesc::getLayoutByDims(shape)};
DataPtr newEdgeAfterLayer(new Data(constLayer->name, td));
newEdgeAfterLayer->setName(constLayer->name);
getCreatorLayer(newEdgeAfterLayer) = constLayer;
getInputTo(newEdgeAfterLayer).clear();
IE_SUPPRESS_DEPRECATED_START
auto icnnnet = static_cast<ICNNNetwork::Ptr>(_clonedNetwork);
IE_SUPPRESS_DEPRECATED_END
auto implNetwork = std::dynamic_pointer_cast<details::CNNNetworkImpl>(icnnnet);
IE_ASSERT(implNetwork != nullptr);
implNetwork->addData(constLayer->name.c_str(), newEdgeAfterLayer);
implNetwork->addLayer(constLayer);
constLayer->outData.push_back(newEdgeAfterLayer);
getInputTo(newEdgeAfterLayer)[layer->name] = layer;
layer->insData.push_back(newEdgeAfterLayer);
};
// The code block below transforms legacy layers to the form more compatible with opset1 in order to simplify future migration
// TODO: remove after plug-in is migrated on opset1
auto all_layers = details::CNNNetSortTopologically(_clonedNetwork);
for (auto &layer : all_layers) {
if (layer->type == "ScaleShift" && layer->insData.size() == 1) {
auto constDimsRank = layer->insData[0].lock()->getDims().size();
Blob::Ptr scalesBlob = layer->blobs["weights"];
if (scalesBlob != nullptr) {
std::vector<size_t> shape(constDimsRank, 1);
shape[shape.size() > 1 ? 1 : 0] = scalesBlob->size();
createConstInputTo(layer, scalesBlob, shape, "weights");
}
Blob::Ptr shiftBlob = layer->blobs["biases"];
if (shiftBlob != nullptr) {
std::vector<size_t> shape(constDimsRank, 1);
shape[shape.size() > 1 ? 1 : 0] = shiftBlob->size();
createConstInputTo(layer, shiftBlob, shape, "biases");
} else if (scalesBlob != nullptr) {
Blob::Ptr biases = make_shared_blob<float>(scalesBlob->getTensorDesc());
if (biases == nullptr)
IE_THROW() << "Cannot make 'biases' shared blob";
biases->allocate();
auto biasesPtr = biases->buffer().as<float*>();
for (size_t i = 0; i < biases->size(); i++)
biasesPtr[i] = 0;
std::vector<size_t> shape(constDimsRank, 1);
shape[shape.size() > 1 ? 1 : 0] = biases->size();
createConstInputTo(layer, biases, shape, "biases");
}
} else if (layer->type == "PReLU" && layer->insData.size() == 1) {
Blob::Ptr scalesBlob = layer->blobs["weights"];
if (scalesBlob != nullptr) {
std::vector<size_t> shape(layer->insData[0].lock()->getDims().size(), 1);
shape[shape.size() > 1 ? 1 : 0] = scalesBlob->size();
createConstInputTo(layer, scalesBlob, shape, "weights");
}
} else if (layer->type == "DeformableConvolution") {
auto * defConvLayer = dynamic_cast<DeformableConvolutionLayer*>(layer.get());
if (defConvLayer == nullptr)
IE_THROW() << "Cannot convert deformable convolution layer.";
Blob::Ptr weightsBlob = defConvLayer->blobs["weights"];
if (weightsBlob != nullptr) {
std::vector<size_t> shape;
if (defConvLayer->_group != 1) {
shape.push_back(defConvLayer->_group);
}
shape.push_back(defConvLayer->_out_depth);
shape.push_back(defConvLayer->input()->getDims()[1]);
for (int i = 1; i <= defConvLayer->_kernel.size(); i++) {
shape.push_back(defConvLayer->_kernel[defConvLayer->_kernel.size() - i]);
}
createConstInputTo(layer, weightsBlob, shape, "weights");
defConvLayer->blobs.clear();
defConvLayer->_weights = nullptr;
}
} else if (layer->type == "BinaryConvolution") {
auto * binConvLayer = dynamic_cast<BinaryConvolutionLayer*>(layer.get());
if (binConvLayer == nullptr)
IE_THROW() << "Cannot convert binary convolution layer.";
Blob::Ptr weightsBlob = binConvLayer->blobs["weights"];
if (weightsBlob != nullptr) {
std::vector<size_t> shape;
if (binConvLayer->_group != 1) {
shape.push_back(binConvLayer->_group);
}
shape.push_back(binConvLayer->_out_depth);
shape.push_back(binConvLayer->input()->getDims()[1]);
for (int i = 1; i <= binConvLayer->_kernel.size(); i++) {
shape.push_back(binConvLayer->_kernel[binConvLayer->_kernel.size() - i]);
}
createConstInputTo(layer, weightsBlob, shape, "weights");
binConvLayer->blobs.clear();
binConvLayer->_weights = nullptr;
}
}
}
OV_ITT_TASK_SKIP(taskChain);
bool isFloatModel = !ngraph::op::util::has_op_with_type<ngraph::op::FakeQuantize>(function);
if (_cfg.batchLimit > 1) {
// check topology for applicability
if (!CanProcessDynBatch(_clonedNetwork)) {
if (!CanProcessDynBatch(_network)) {
IE_THROW() << "MKLDNNGraph::CreateGraph: such topology cannot be compiled for dynamic batch!";
}
}
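// For reference, has_op_with_type<T>() from <transformations/utils/utils.hpp> behaves
// roughly like the sketch below (paraphrased, not quoted from the library): it walks
// the function's ops and reports whether any of them is of the requested type.
template <typename T>
bool has_op_with_type(const std::shared_ptr<const ngraph::Function>& function) {
    for (const auto& op : function->get_ops()) {
        if (std::dynamic_pointer_cast<T>(op))
            return true;
    }
    return false;
}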
@ -293,12 +119,11 @@ MKLDNNExecNetwork::Graph::Lock MKLDNNExecNetwork::GetGraph() {
std::exception_ptr exception;
auto makeGraph = [&] {
try {
auto localNetwork = cloneNetwork(_clonedNetwork);
{
std::lock_guard<std::mutex> lock{_cfgMutex};
graphLock._graph.setConfig(_cfg);
}
graphLock._graph.CreateGraph(localNetwork, extensionManager, _numaNodesWeights[numaNodeId]);
graphLock._graph.CreateGraph(_network, extensionManager, _numaNodesWeights[numaNodeId]);
} catch(...) {
exception = std::current_exception();
}
@ -386,53 +211,48 @@ InferenceEngine::Parameter MKLDNNExecNetwork::GetMetric(const std::string &name)
bool MKLDNNExecNetwork::CanProcessDynBatch(const InferenceEngine::CNNNetwork &network) const {
InputsDataMap inputs = network.getInputsInfo();
CNNLayerSet inputLayers;
std::unordered_set<CNNLayer *> allLayers;
if (inputs.empty())
return false;
auto & secondLayers = getInputTo(inputs.begin()->second->getInputData());
if (secondLayers.empty())
return false;
auto function = network.getFunction();
if (function == nullptr) {
IE_THROW() << "CPU plug-in doesn't support not ngraph-based model!";
}
bool check_result = true;
details::UnorderedDFS(allLayers, secondLayers.begin()->second, [&](CNNLayerPtr layer) {
auto type = TypeFromName(layer->type);
// This is WA for Tile layer
auto tileLayer = dynamic_cast<TileLayer *>(layer.get());
if (tileLayer && tileLayer->axis)
return;
auto ops = function->get_ordered_ops();
for (auto op : ops) {
auto type = TypeFromName(op->get_type_name());
if (type == Tile) {
const auto tile = std::dynamic_pointer_cast<const ngraph::opset1::Tile>(op);
const auto repeatsNode = std::dynamic_pointer_cast<const ngraph::opset1::Constant>(tile->get_input_node_shared_ptr(1));
if (!repeatsNode)
return false;
if (tile && repeatsNode->cast_vector<int64_t>()[0] == 1)
continue;
}
auto reshapeLayer = dynamic_cast<ReshapeLayer *>(layer.get());
if (reshapeLayer &&
type == Reshape &&
(reshapeLayer->outData[0]->getTensorDesc().getDims()[0] ==
reshapeLayer->insData[0].lock()->getTensorDesc().getDims()[0])) {
return;
if (type == Reshape) {
if (op->get_input_shape(0)[0] == op->get_output_shape(0)[0])
continue;
}
if (type != Input &&
type != Output &&
type != Convolution &&
type != Deconvolution &&
type != Activation &&
type != Depthwise &&
type != Lrn &&
type != Pooling &&
type != FullyConnected &&
type != Gemm &&
type != SoftMax &&
type != MatMul &&
type != Softmax &&
type != Split &&
type != Concatenation &&
type != Eltwise &&
type != BatchNormalization &&
type != Copy) {
check_result = false;
type != Eltwise) {
return false;
}
}, false);
}
return check_result;
return true;
}
IE_SUPPRESS_DEPRECATED_START

View File

@ -14,7 +14,6 @@
#include <memory>
#include <map>
#include <string>
#include <legacy/cnn_network_impl.hpp>
#include <unordered_map>
namespace MKLDNNPlugin {
@ -49,7 +48,7 @@ protected:
friend class MKLDNNInferRequest;
MKLDNNExtensionManager::Ptr extensionManager;
std::vector<InferenceEngine::IVariableStateInternal::Ptr> memoryStates;
InferenceEngine::CNNNetwork _clonedNetwork;
const InferenceEngine::CNNNetwork _network;
std::mutex _cfgMutex;
Config _cfg;
std::atomic_int _numRequests = {0};

View File

@ -31,17 +31,14 @@ InferenceEngine::ILayerImpl::Ptr MKLDNNExtensionManager::CreateImplementation(co
return nullptr;
}
std::shared_ptr<InferenceEngine::ILayerImplFactory> MKLDNNExtensionManager::CreateExtensionFactory(
const InferenceEngine::CNNLayerPtr &layer) {
if (!layer)
IE_THROW() << "Cannot get cnn layer!";
std::shared_ptr<InferenceEngine::ILayerImplFactory> MKLDNNExtensionManager::CreateExtensionFactory(const std::shared_ptr<ngraph::Node>& op) {
std::shared_ptr<ILayerImplFactory> factory;
for (auto& ext : _extensions) {
ResponseDesc responseDesc;
StatusCode rc = GENERAL_ERROR;
ILayerImplFactory* factory_ptr = nullptr;
if (auto mkldnnExt = std::dynamic_pointer_cast<Extensions::Cpu::MKLDNNExtensions>(ext))
rc = mkldnnExt->getFactoryFor(factory_ptr, layer.get(), &responseDesc);
rc = mkldnnExt->getFactoryFor(factory_ptr, op, &responseDesc);
if (rc != OK) {
factory = nullptr;
continue;

View File

@ -8,7 +8,6 @@
#include <vector>
#include <memory>
#include <ie_iextension.h>
#include <legacy/ie_layers.h>
#include "nodes/list.hpp"
namespace MKLDNNPlugin {
@ -18,7 +17,7 @@ public:
using Ptr = std::shared_ptr<MKLDNNExtensionManager>;
MKLDNNExtensionManager() = default;
InferenceEngine::ILayerImpl::Ptr CreateImplementation(const std::shared_ptr<ngraph::Node>& op);
std::shared_ptr<InferenceEngine::ILayerImplFactory> CreateExtensionFactory(const InferenceEngine::CNNLayerPtr& Layer);
std::shared_ptr<InferenceEngine::ILayerImplFactory> CreateExtensionFactory(const std::shared_ptr<ngraph::Node>& op);
void AddExtension(InferenceEngine::IExtensionPtr extension);
private:

View File

@ -26,11 +26,9 @@
#include <nodes/mkldnn_reorder_node.h>
#include <nodes/mkldnn_convert_node.h>
#include <legacy/graph_tools.hpp>
#include <ie_algorithm.hpp>
#include <blob_factory.hpp>
#include <legacy/net_pass.h>
#include <legacy/details/ie_cnn_network_tools.h>
#include "nodes/common/cpu_memcpy.h"
#include "nodes/common/cpu_convert.h"
#include "precision_utils.h"
@ -39,6 +37,14 @@
#include "utils/general_utils.h"
#include "utils/debug_capabilities.h"
#include "utils/node_dumper.h"
#include "utils/ngraph_utils.hpp"
#include "utils/cpu_utils.hpp"
#include <ngraph/node.hpp>
#include <ngraph/function.hpp>
#include <ngraph/variant.hpp>
#include <ngraph/ops.hpp>
#include <transformations/utils/utils.hpp>
/*****************************************************
* Debug capability
@ -60,31 +66,7 @@ typedef std::vector<edge_cluster_t> edge_clusters_t;
mkldnn::engine MKLDNNGraph::eng(mkldnn::engine::kind::cpu, 0);
template<typename NET>
void MKLDNNGraph::ApplyUnrollPasses(NET &net) {
OV_ITT_SCOPED_TASK(itt::domains::MKLDNNPlugin, "MKLDNNGraph::ApplyUnrollPasses");
NetPass::CombineRNNSeq(net);
bool ti_proc_ok = NetPass::UnrollRNN_if(net, [] (const RNNCellBase &rnn) -> bool {
if (rnn.clip != 0.0f)
return true;
if ((rnn.cellType == RNNCellBase::GRU || rnn.cellType == RNNCellBase::GRU_LBR) &&
rnn.activations != std::vector<std::string> {"sigmoid", "tanh"})
return true;
if (rnn.cellType == RNNCellBase::LSTM &&
rnn.activations != std::vector<std::string> {"sigmoid", "tanh", "tanh"})
return true;
return false;
});
if (!ti_proc_ok)
IE_THROW() << "Plugin doesn't support Tensor Iterator in pure form. "
"None TI optimization pattern has been applied successfully";
}
template void MKLDNNGraph::ApplyUnrollPasses(TensorIterator::Body&);
template void MKLDNNGraph::ApplyUnrollPasses(CNNNetwork&);
template<typename NET>
void MKLDNNGraph::CreateGraph(const NET &net, const MKLDNNExtensionManager::Ptr& extMgr,
void MKLDNNGraph::CreateGraph(NET &net, const MKLDNNExtensionManager::Ptr& extMgr,
MKLDNNWeightsSharing::Ptr &w_cache) {
OV_ITT_SCOPE(FIRST_INFERENCE, MKLDNNPlugin::itt::domains::MKLDNN_LT, "CreateGraph");
@ -98,233 +80,252 @@ void MKLDNNGraph::CreateGraph(const NET &net, const MKLDNNExtensionManager::Ptr&
status = Ready;
}
template void MKLDNNGraph::CreateGraph(const TensorIterator::Body&,
template void MKLDNNGraph::CreateGraph(const std::shared_ptr<const ngraph::Function>&,
const MKLDNNExtensionManager::Ptr&, MKLDNNWeightsSharing::Ptr&);
template void MKLDNNGraph::CreateGraph(const CNNNetwork&,
const MKLDNNExtensionManager::Ptr&, MKLDNNWeightsSharing::Ptr&);
void MKLDNNGraph::Replicate(const TensorIterator::Body &subgraph, const MKLDNNExtensionManager::Ptr& extMgr) {
void MKLDNNGraph::Replicate(const std::shared_ptr<const ngraph::Function> &subgraph, const MKLDNNExtensionManager::Ptr& extMgr) {
this->_name = "subgraph";
this->reuse_io_tensors = false;
// Map data object onto producer layer(node)
std::unordered_map<Data*, std::pair<MKLDNNNodePtr, int>> data2node;
// Map data object onto producer node
std::map<std::shared_ptr<ngraph::Node>, std::pair<MKLDNNNodePtr, int>> op2node;
// nodes which have no consumers (output or just unused) but aren't marked as graph outputs.
// They will be stored as fake outputs separately.
std::unordered_set<DataPtr> unused_data;
std::deque<ngraph::Output<ngraph::Node>> unusedOutputs;
// Step 1. Replicate input nodes
for (const auto &input : subgraph.inputs) {
if (input->getPrecision() == Precision::UNSPECIFIED) continue; // const node holder
auto creator = getCreatorLayer(input).lock();
if (creator == nullptr) {
creator.reset(new CNNLayer({input->getName(), "Input", input->getTensorDesc().getPrecision()}));
creator->outData.push_back(input);
auto getParentOutputPort = [](const std::shared_ptr<ngraph::Node> childOp, const std::shared_ptr<ngraph::Node> parentOp,
const size_t childInputPort) -> int {
for (size_t parentPort = 0; parentPort < parentOp->get_output_size(); parentPort++) {
if (childOp->input(childInputPort).get_tensor_ptr() == parentOp->output(parentPort).get_tensor_ptr()) {
return static_cast<int>(parentPort);
}
}
const MKLDNNNodePtr node(MKLDNNNode::factory().create(creator, getEngine(), extMgr, weightsCache));
data2node[input.get()] = {node, 0};
return -1;
};
for (const auto op : subgraph->get_ordered_ops()) {
const MKLDNNNodePtr node {MKLDNNNode::factory().create(op, getEngine(), extMgr, weightsCache)};
graphNodes.push_back(node);
inputNodes[input->getName()] = node;
if (getInputTo(input).empty()) {
unused_data.insert(input);
if (op->get_type_info() == ngraph::op::v0::Parameter::type_info) {
inputNodesMap[node->getName()] = node;
}
}
// Step 2. Replicate all internal nodes.
for (const auto layer : NetPass::TIBodySortTopologically(subgraph)) {
const MKLDNNNodePtr node {MKLDNNNode::factory().create(layer, getEngine(), extMgr, weightsCache)};
graphNodes.push_back(node);
if (op->get_type_info() == ngraph::op::v0::Result::type_info) {
auto prev = op->get_input_node_shared_ptr(0);
std::string inputID;
inputID = prev->get_friendly_name();
if (prev->get_output_size() > 1) {
inputID += "." + std::to_string(op->get_input_source_output(0).get_index());
}
for (int port = 0; port < layer->insData.size(); port++) {
auto data = layer->insData[port].lock();
outputNodesMap[inputID] = node;
}
auto port_info = data2node[data.get()];
auto parent_node = port_info.first;
auto parent_port_idx = port_info.second;
for (size_t port = 0; port < op->get_input_size(); port++) {
auto parentOp = op->get_input_node_shared_ptr(port);
MKLDNNEdgePtr edge(new MKLDNNEdge(parent_node, node, parent_port_idx, port));
auto portInfo = op2node[parentOp];
auto parentNode = portInfo.first;
MKLDNNEdgePtr edge(new MKLDNNEdge(parentNode, node, getParentOutputPort(op, parentOp, port), port));
node->addEdge(edge);
graphEdges.push_back(edge);
}
int out_port_idx = 0;
for (auto &out_data : layer->outData) {
data2node[out_data.get()] = {node, out_port_idx++};
if (getInputTo(out_data).empty()) {
unused_data.insert(out_data);
if (!MKLDNNPlugin::one_of(op->get_type_info(),
ngraph::op::v0::Result::type_info,
ngraph::op::v3::Assign::type_info,
ngraph::op::v6::Assign::type_info)) {
int outPortIdx = 0;
for (int oi = 0; oi < op->get_output_size(); oi++) {
op2node[op->output(oi).get_node_shared_ptr()] = {node, outPortIdx++};
if (op->get_output_target_inputs(oi).empty()) {
unusedOutputs.push_back(op->output(oi));
}
}
}
}
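// MKLDNNPlugin::one_of() (from "utils/general_utils.h") simply checks whether its first
// argument equals any of the remaining ones; a plausible sketch, not quoted from the header:
template <typename T, typename U>
inline bool one_of(T val, U item) {
    return val == item;
}
template <typename T, typename U, typename... Rest>
inline bool one_of(T val, U item, Rest... rest) {
    return val == item || one_of(val, rest...);
}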
// Step 3. Add output nodes and output stubs for unused data objects.
for (const auto &output : subgraph.outputs) {
auto port_info = data2node[output.get()];
auto parent_node = port_info.first;
auto parent_port_idx = port_info.second;
CNNLayerPtr layer(new CNNLayer({"out_" + output->getName(), "Output", output->getTensorDesc().getPrecision()}));
layer->insData.push_back(output);
const MKLDNNNodePtr node {MKLDNNNode::factory().create(layer, getEngine(), extMgr, weightsCache)};
MKLDNNEdgePtr edge(new MKLDNNEdge(parent_node, node, parent_port_idx, 0));
node->addEdge(edge);
graphEdges.push_back(edge);
graphNodes.push_back(node);
outputNodes.push_back(node);
unused_data.erase(output);
}
// Add stub output node for unused data
for (auto to_stub_data : unused_data) {
auto port_info = data2node[to_stub_data.get()];
auto parent_node = port_info.first;
auto parent_port_idx = port_info.second;
CNNLayerPtr layer(new CNNLayer({"stub_" + to_stub_data->getName(), "Output", to_stub_data->getTensorDesc().getPrecision()}));
layer->insData.push_back(to_stub_data);
const MKLDNNNodePtr node(MKLDNNNode::factory().create(layer, getEngine(), extMgr, weightsCache));
MKLDNNEdgePtr edge(new MKLDNNEdge(parent_node, node, parent_port_idx, 0));
node->addEdge(edge);
for (auto unusedOutput : unusedOutputs) {
auto portInfo = op2node[unusedOutput.get_node_shared_ptr()];
auto parentNode = portInfo.first;
auto port = portInfo.second;
const auto nodeName = std::string("stub_") + std::to_string(unusedOutput.get_index()) + "_" + parentNode->getName();
const MKLDNNNodePtr outNode = std::make_shared<MKLDNNInputNode>(parentNode->outDims[port].ToSizeVector(),
parentNode->getOriginalOutputPrecisionAtPort(port),
nodeName, "Result", getEngine(), weightsCache);
MKLDNNEdgePtr edge(new MKLDNNEdge(parentNode, outNode, port, 0));
outNode->addEdge(edge);
graphEdges.push_back(edge);
graphNodes.push_back(node);
graphNodes.push_back(outNode);
}
}
void MKLDNNGraph::Replicate(const CNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr) {
OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::MKLDNN_LT, "MKLDNNGraph::Replicate", "CNNNetwork");
InputsDataMap inputs = network.getInputsInfo();
InputsDataMap inputsInfo = network.getInputsInfo();
OutputsDataMap outputsInfo = network.getOutputsInfo();
this->_name = network.getName();
// The input layer precision has to be equal to the InputData precision
std::map<std::string, Precision> changedPrecision;
for (const auto& input : inputs) {
auto inputLayer = getCreatorLayer(input.second->getInputData()).lock();
if (inputLayer) {
inputLayer->precision = inputLayer->outData[0]->getTensorDesc().getPrecision();
}
std::shared_ptr<const ngraph::Function> func = network.getFunction();
if (!func) {
IE_THROW() << "Function pointer inside CNNNetwork is nullptr";
}
std::unordered_map<CNNLayerPtr, MKLDNNNodePtr> layer2node;
std::unordered_set<DataPtr> unused_data; // nodes which has no consumers (output or just unused)
auto orderedOps = func->get_ordered_ops();
// TODO [NM]: unordered_map is preferred from performance perspective. Needs hash for ngraph::Node
std::map<std::shared_ptr<ngraph::Node>, MKLDNNNodePtr> op2node;
std::deque<ngraph::Output<ngraph::Node>> unusedOutputs; // nodes which have no consumers (output or just unused)
auto getParentOutputPort = [](const std::shared_ptr<ngraph::Node> childOp, const std::shared_ptr<ngraph::Node> parentOp,
const size_t childInputPort) -> int {
for (size_t parentPort = 0; parentPort < parentOp->get_output_size(); parentPort++) {
if (childOp->input(childInputPort).get_tensor_ptr() == parentOp->output(parentPort).get_tensor_ptr()) {
return static_cast<int>(parentPort);
}
}
auto _parent_port = [] (const DataPtr &data) -> int {
auto parent = getCreatorLayer(data).lock();
for (int i = 0; parent->outData.size(); i++)
if (data == parent->outData[i])
return i;
return -1;
};
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "AllNodes");
// Replicate All Nodes in topological order
for (const auto layer : CNNNetSortTopologically(network)) {
CNNLayerPtr _layer = layer;
if (layer->type == "Memory" && layer->GetParamAsString("index") == "1") {
auto memoryId = layer->GetParamAsString("id");
Precision portPrecision = layer->outData[0]->getTensorDesc().getPrecision();
_layer.reset(new CNNLayer({layer->name + "/id=" + memoryId, "MemoryInput", portPrecision}));
_layer->params = layer->params;
_layer->outData = layer->outData;
}
const MKLDNNNodePtr node(MKLDNNNode::factory().create(_layer, getEngine(), extMgr, weightsCache));
for (const auto& op : orderedOps) {
const MKLDNNNodePtr node(MKLDNNNode::factory().create(op, getEngine(), extMgr, weightsCache));
graphNodes.push_back(node);
layer2node[layer] = node;
if (layer->params.count("originalLayersNames")) {
node->originalLayers = layer->params["originalLayersNames"];
if (op->get_type_info() == ngraph::op::v0::Parameter::type_info) {
if (inputsInfo.count(node->getName()) != 0) {
inputNodesMap[node->getName()] = node;
}
}
for (int port = 0; port < layer->insData.size(); port++) {
auto data = layer->insData[port].lock();
auto parent_layer = getCreatorLayer(data).lock();
if (!parent_layer) continue; // no parent means that it is input data node (or memory/const layer)
if (op->get_type_info() == ngraph::op::v0::Result::type_info) {
// [NM] TODO: Several networks have model outputs whose names mismatch the Result node names
const auto &input = op->input_value(0);
NGRAPH_SUPPRESS_DEPRECATED_START
auto name = input.get_tensor().get_name();
NGRAPH_SUPPRESS_DEPRECATED_END
if (name.empty()) {
name = ngraph::op::util::create_ie_output_name(input);
}
auto parent_node = layer2node[parent_layer];
if (outputsInfo.count(name) != 0) {
outputNodesMap[name] = node;
}
}
MKLDNNEdgePtr edge(new MKLDNNEdge(parent_node, node, _parent_port(data), port));
op2node[op] = node;
for (size_t port = 0; port < op->get_input_size(); port++) {
auto parentOp = op->get_input_node_shared_ptr(port);
auto parentNode = op2node[parentOp];
MKLDNNEdgePtr edge(new MKLDNNEdge(parentNode, node, getParentOutputPort(op, parentOp, port), static_cast<int>(port)));
node->addEdge(edge);
graphEdges.push_back(edge);
}
for (auto &out_data : layer->outData) {
if (getInputTo(out_data).empty()) {
unused_data.insert(out_data);
if (!MKLDNNPlugin::one_of(op->get_type_info(),
ngraph::op::v0::Result::type_info,
ngraph::op::v3::Assign::type_info,
ngraph::op::v6::Assign::type_info)) {
for (int oi = 0; oi < op->get_output_size(); oi++) {
if (op->get_output_target_inputs(oi).empty()) {
unusedOutputs.push_back(op->output(oi));
}
}
}
}
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "Outputs");
OutputsDataMap outputs = network.getOutputsInfo();
for (const auto &output : outputs) {
const auto data = output.second;
auto parent_layer = getCreatorLayer(data).lock();
auto parent_node = layer2node[parent_layer];
CNNLayerPtr layer(new CNNLayer({"out_" + output.first, "Output", data->getTensorDesc().getPrecision()}));
layer->insData.push_back(data);
const MKLDNNNodePtr node(MKLDNNNode::factory().create(layer, getEngine(), extMgr, weightsCache));
MKLDNNEdgePtr edge(new MKLDNNEdge(parent_node, node, _parent_port(data), 0));
node->addEdge(edge);
// Add stub output node for unused outputs
for (auto unusedOutput : unusedOutputs) {
auto parentNode = op2node[unusedOutput.get_node_shared_ptr()];
const auto port = unusedOutput.get_index();
const auto nodeName = std::string("stub_") + std::to_string(unusedOutput.get_index()) + "_" + parentNode->getName();
const MKLDNNNodePtr outNode = std::make_shared<MKLDNNInputNode>(parentNode->outDims[port].ToSizeVector(),
parentNode->getOriginalOutputPrecisionAtPort(port),
nodeName, "Result", getEngine(), weightsCache);
MKLDNNEdgePtr edge(new MKLDNNEdge(parentNode, outNode, port, 0));
outNode->addEdge(edge);
graphEdges.push_back(edge);
graphNodes.push_back(node);
outputNodes.push_back(node);
unused_data.erase(data);
graphNodes.push_back(outNode);
}
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "AddStubs");
// We set all non const data paths precision to BF16 in case enforceBF16 flag is switched on.
if (config.enforceBF16) {
bool isQuantizedModel = false;
for (auto& node : graphNodes) {
if (node->getType() == FakeQuantize)
isQuantizedModel = true;
}
// Add stub output node for unused data
for (auto to_stub_data : unused_data) {
auto parent_layer = getCreatorLayer(to_stub_data).lock();
auto parent_node = layer2node[parent_layer];
// Floating point parts of FP32 + INT8 or FP32 + BIN mixed precision models will be executed in BF16 precision
// only if enforceBF16 flag was set manually because current performance is not good enough to enable it by default
if (implication(isQuantizedModel, config.manualEnforceBF16)) {
for (auto &node : graphNodes) {
if (node->getType() != Input && node->getType() != Output) {
for (size_t i = 0; i < node->getOriginalInputsNumber(); i++) {
auto &parent = node->getParentEdgesAtPort(i)[0]->getParent();
if (!(parent->getType() == Input && parent->isConstant()) && node->getOriginalInputPrecisionAtPort(i) == Precision::FP32)
node->setOriginalInputPrecisionAtPort(i, Precision::BF16);
}
CNNLayerPtr layer(new CNNLayer({"stub_" + parent_layer->name, "Output", to_stub_data->getTensorDesc().getPrecision()}));
layer->insData.push_back(to_stub_data);
const MKLDNNNodePtr node(MKLDNNNode::factory().create(layer, getEngine(), extMgr, weightsCache));
MKLDNNEdgePtr edge(new MKLDNNEdge(parent_node, node, _parent_port(to_stub_data), 0));
node->addEdge(edge);
graphEdges.push_back(edge);
graphNodes.push_back(node);
}
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "Inputs");
// Replicate input nodes
for (const auto& input : inputs) {
auto inputLayer = getCreatorLayer(input.second->getInputData()).lock();
inputNodes[input.first] = layer2node[inputLayer];
// Loading mean images
MKLDNNDims outDims;
if (!inputNodes[input.first]->getChildEdgeAt(0)->getDims().ndims())
outDims = MKLDNNDims(InferenceEngine::SizeVector(1, 1));
else
outDims = MKLDNNDims(inputNodes[input.first]->getChildEdgeAt(0)->getDims());
if (inputs.find(input.first) != inputs.end()) {
InputInfo::Ptr ii = inputs[input.first];
if (ii && ii->getPreProcess().getNumberOfChannels()) {
_meanImages[input.first].Load(outDims, ii);
for (size_t i = 0; i < node->getOriginalOutputsNumber(); i++) {
if (node->getOriginalOutputPrecisionAtPort(i) == Precision::FP32)
node->setOriginalOutputPrecisionAtPort(i, Precision::BF16);
}
}
}
}
}
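// implication() above comes from "utils/general_utils.h"; it is plain logical implication,
// roughly (a sketch, not quoted from the header):
constexpr inline bool implication(bool cause, bool cond) {
    return !cause || cond;
}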
// change precision for input/output nodes to avoid extra data conversion when set input/output blobs
// also we need to change input/output precisions for consumers/producers to avoid inserting reorder
for (auto &input : inputNodesMap) {
const auto precToSet = normalizeToSupportedPrecision(inputsInfo.at(input.first)->getPrecision());
input.second->setOriginalOutputPrecisionAtPort(0, precToSet);
const auto childEdges = input.second->getChildEdgesAtPort(0);
for (size_t i = 0; i < childEdges.size(); i++) {
const auto child = childEdges[i]->getChild();
if (child->getOriginalInputPrecisionAtPort(childEdges[i]->getOutputNum()) != Precision::BF16)
child->setOriginalInputPrecisionAtPort(childEdges[i]->getOutputNum(), precToSet);
}
}
for (auto &output : outputNodesMap) {
const auto precToSet = normalizeToSupportedPrecision(outputsInfo.at(output.first)->getPrecision());
output.second->setOriginalInputPrecisionAtPort(0, precToSet);
const auto parentEdges = output.second->getParentEdgesAtPort(0);
for (size_t i = 0; i < parentEdges.size(); i++) {
const auto parent = parentEdges[i]->getParent();
parent->setOriginalOutputPrecisionAtPort(parentEdges[i]->getInputNum(), precToSet);
}
}
// Loading mean images
for (const auto& input : inputsInfo) {
MKLDNNDims outDims;
if (!inputNodesMap[input.first]->getChildEdgeAt(0)->getDims().ndims()) {
outDims = MKLDNNDims(InferenceEngine::SizeVector(1, 1));
} else {
outDims = inputNodesMap[input.first]->getChildEdgeAt(0)->getDims();
}
InputInfo::Ptr ii = inputsInfo[input.first];
if (ii && ii->getPreProcess().getNumberOfChannels()) {
_meanImages[input.first].Load(outDims, ii);
}
}
}
void MKLDNNGraph::InitGraph() {
@ -349,11 +350,6 @@ void MKLDNNGraph::InitGraph() {
CreatePrimitives();
SetOriginalLayerNames();
if (!config.dumpToDot.empty())
dumpToDotFile(config.dumpToDot + "_init.dot");
#ifndef CPU_DEBUG_CAPS
for (auto &graphNode : graphNodes) {
graphNode->cleanup();
@ -366,31 +362,6 @@ void MKLDNNGraph::InitGraph() {
ExecuteConstantNodesOnly();
}
void MKLDNNGraph::SetOriginalLayerNames() {
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::MKLDNN_LT, "MKLDNNGraph::SetOriginalLayerNames");
// Do it before cleanup. Because it will lose original layers information
for (auto &graphNode : graphNodes) {
auto nodeType = graphNode->getType();
if (nodeType == Reorder || nodeType == Output) continue;
if (graphNode->getOriginalLayers().empty()) {
graphNode->addOriginalLayer(graphNode->getCnnLayer());
}
if (graphNode->getFusedWith().size() || graphNode->getMergeWith().size()) {
// Original layer names
std::vector<MKLDNNNodePtr> internal = graphNode->getFusedWith();
auto &merged = graphNode->getMergeWith();
internal.insert(internal.end(), merged.begin(), merged.end());
for (auto &sub_node : internal) {
graphNode->addOriginalLayer(sub_node->getCnnLayer());
}
}
}
}
void MKLDNNGraph::InitNodes() {
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::MKLDNN_LT, "MKLDNNGraph::InitNodes");
for (auto &node : graphNodes) {
@ -500,7 +471,7 @@ void MKLDNNGraph::InitEdges() {
std::unordered_set<std::string> uniqueLayerNames;
for (auto node : graphNodes) {
uniqueLayerNames.insert(node->getCnnLayer()->name);
uniqueLayerNames.insert(node->getName());
}
for (auto i = 0; i < numberOfEdges; i++) {
@ -510,14 +481,17 @@ void MKLDNNGraph::InitEdges() {
// Check if there is a reorder that supports the type conversion
if (edge->getInputDesc().getPrecision() != edge->getOutputDesc().getPrecision() &&
!isReorderAvailable(edge->getInputDesc(), edge->getOutputDesc(), this->getEngine())) {
//If we are here, then we need to insert Convert, because there are no reorders that support such type conversion
std::string convertName = edge->getParent()->getName() + "_" +
edge->getInputDesc().getPrecision().name() + "_" + edge->getOutputDesc().getPrecision().name();
!isReorderAvailable(edge->getInputDesc(), edge->getOutputDesc(), this->getEngine())) {
// If we are here, then we need to insert Convert, because there are no reorders that support such type conversion
const auto inDesc = edge->getInputDesc();
const auto outDesc = edge->getOutputDesc();
CNNLayerPtr convert(new CNNLayer(LayerParams{convertName, "Convert", edge->getInputDesc().getPrecision()}));
auto convertNode = std::make_shared<MKLDNNConvertNode>(convert, this->getEngine(), this->weightsCache);
convertNode->setDescs(edge->getInputDesc(), edge->getOutputDesc());
std::string convertName = edge->getParent()->getName() + "_" +
inDesc.getPrecision().name() + "_" + outDesc.getPrecision().name();
auto convertNode = std::make_shared<MKLDNNConvertNode>(inDesc.getDims(), inDesc.getPrecision(), outDesc.getPrecision(), convertName,
this->getEngine(), this->weightsCache);
convertNode->setDescs(inDesc, outDesc);
InsertNode(edge, convertNode, true);
//Check if reorder is still needed
@ -741,8 +715,8 @@ void MKLDNNGraph::CreatePrimitives() {
void MKLDNNGraph::PushInputData(const std::string& name, const InferenceEngine::Blob::Ptr &in) {
if (!IsReady()) IE_THROW()<< "Wrong state. Topology not ready.";
auto input = inputNodes.find(name);
if (input != inputNodes.end()) {
auto input = inputNodesMap.find(name);
if (input != inputNodesMap.end()) {
MKLDNNDims outDims = input->second->getChildEdgeAt(0)->getDims();
const void *ext_data_ptr = in->cbuffer();
@ -774,11 +748,12 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
if (!IsReady())
IE_THROW() << "Wrong state. Topology not ready.";
for (MKLDNNNodePtr &node : outputNodes) {
// remove out_ from node name
std::string name = node->getName().substr(4);
for (auto &outputMap : outputNodesMap) {
auto name = outputMap.first;
auto node = outputMap.second;
const MKLDNNMemory& intr_blob = node->getParentEdgeAt(0)->getMemory();
if (out.find(name) == out.end()) {
// TODO [NM]: Do we really need this path?
// TODO: Create blob from MemoryDesc
Blob::Ptr outBlob = make_shared_blob<float>({Precision::FP32, node->getParentEdgeAt(0)->getDims().ToSizeVector(),
TensorDesc::getLayoutByDims(node->getParentEdgeAt(0)->getDims().ToSizeVector())},
@ -816,7 +791,29 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
MB_to_process = std::min<int>(config.batchLimit, MB_to_process);
size_t size_to_copy = intr_blob.GetElementsCount() * MB_to_process / MB;
cpu_convert(intr_blob_ptr, ext_blob_ptr, srcPrec, dstPrec, size_to_copy);
const auto actualDesc = node->getParentEdgeAt(0)->getDesc();
const auto expectedDesc = ext_blob->getTensorDesc();
// TODO [NM]: need to create a universal reorder which will detect cases when we really need to use it
// WA: for cases when output shape after transformation will be 1x1x1x1 but model output is scalar
bool isScalarOutput = false;
if (actualDesc.getLayout() == SCALAR) {
isScalarOutput = expectedDesc.getLayout() == SCALAR ||
std::accumulate(expectedDesc.getDims().begin(), expectedDesc.getDims().end(), (size_t)1, std::multiplies<size_t>()) == 1;
} else if (expectedDesc.getLayout() == SCALAR) {
isScalarOutput = actualDesc.getLayout() == SCALAR ||
std::accumulate(actualDesc.getDims().begin(), actualDesc.getDims().end(), (size_t)1, std::multiplies<size_t>()) == 1;
}
if (actualDesc.getBlockingDesc() != expectedDesc.getBlockingDesc() && !isScalarOutput) {
auto outBlobDesc = MKLDNNMemoryDesc{expectedDesc};
auto outBloMem = MKLDNNMemory(eng);
outBloMem.Create(outBlobDesc, ext_blob_ptr, false);
outBloMem.SetData(intr_blob, 0, false);
} else {
cpu_convert(intr_blob_ptr, ext_blob_ptr, srcPrec, dstPrec, size_to_copy);
}
}
}
@ -966,8 +963,6 @@ void MKLDNNGraph::GetPerfData(std::map<std::string, InferenceEngine::InferenceEn
for (int i = 1; i < graphNodes.size(); i++) {
getPerfMapFor(perfMap, graphNodes[i]);
}
if (!config.dumpToDot.empty()) dumpToDotFile(config.dumpToDot + "_perf.dot");
}
void MKLDNNGraph::setConfig(const Config &cfg) {
@ -983,18 +978,14 @@ Config MKLDNNGraph::getProperty() const {
}
void MKLDNNGraph::getInputBlobs(InferenceEngine::BlobMap &resp) {
for (auto &it : inputNodes) {
MKLDNNInputNode* node = dynamic_cast<MKLDNNInputNode*>(it.second.get());
if (!node || node->isConstant())
continue;
resp[it.first] = node->getChildEdgeAt(0)->getBlob();
for (auto &it : inputNodesMap) {
resp[it.first] = it.second->getChildEdgeAt(0)->getBlob();
}
}
void MKLDNNGraph::getOutputBlobs(InferenceEngine::BlobMap &resp) {
for (auto &it : outputNodes) {
std::string name = it->getName().substr(4);
resp[name] = it->getParentEdgeAt(0)->getBlob();
for (auto &it : outputNodesMap) {
resp[it.first] = it.second->getParentEdgeAt(0)->getBlob();
}
}
@ -1150,10 +1141,7 @@ void MKLDNNGraph::RemoveDroppedEdges() {
MKLDNNNodePtr MKLDNNGraph::InsertReorder(MKLDNNEdgePtr edge, std::string layerName, const TensorDesc& inDesc, const TensorDesc& outDesc,
bool isOptimized, InferenceEngine::Blob::Ptr scales) {
CNNLayerPtr layer(new CNNLayer({layerName,
"Reorder",
inDesc.getPrecision()}));
MKLDNNNodePtr newReorder(new MKLDNNReorderNode(layer, getEngine(), weightsCache));
MKLDNNNodePtr newReorder(new MKLDNNReorderNode(layerName, getEngine(), weightsCache));
auto *reorderPtr = dynamic_cast<MKLDNNReorderNode *>(newReorder.get());
if (reorderPtr == nullptr) {
IE_THROW() << "MKLDNNGraph::InsertReorder: Cannot cast to MKLDNNReorderNode";
@ -1165,7 +1153,7 @@ MKLDNNNodePtr MKLDNNGraph::InsertReorder(MKLDNNEdgePtr edge, std::string layerNa
InsertNode(edge, newReorder, true);
// Using the method MKLDNNEdge::getDesc() we can check that input and output tensor descriptors are equal.
// Due to the specificity of MKLDNNGraphOptimizer::MergePermuteAndReorder() that isOptimized flag uses, we shouldn't do these checks.
// Due to the specificity of MKLDNNGraphOptimizer::MergeTransposeAndReorder() that isOptimized flag uses, we shouldn't do these checks.
if (!isOptimized) {
newReorder->getParentEdgeAt(0)->getDesc();
newReorder->getChildEdgeAt(0)->getDesc();
@ -1218,15 +1206,6 @@ InferenceEngine::CNNNetwork MKLDNNGraph::dump() const {
return dump_graph_as_ie_ngraph_net(*this);
}
void MKLDNNGraph::dumpToDotFile(std::string file) const {
std::ofstream dot;
dot.open(file);
if (!dot.is_open()) IE_THROW() << "CPU Plugin cannot create dot file " << file << ".";
dump_graph_as_dot(*this, dot);
dot.close();
}
void MKLDNNGraph::printGraphInfo() const {
for (auto &graphNode : graphNodes) {
std::cout << "name: " << graphNode->getName() << " [ ";

View File

@ -4,14 +4,12 @@
#pragma once
#include "ie_parallel.hpp"
#include "cpp/ie_cnn_network.h"
#include "config.h"
#include "mkldnn_memory.h"
#include "mean_image.h"
#include "mkldnn_node.h"
#include "mkldnn_edge.h"
#include "threading/ie_thread_local.hpp"
#include <map>
#include <string>
#include <vector>
@ -48,7 +46,7 @@ public:
void getOutputBlobs(InferenceEngine::BlobMap &out_map);
template<typename NET>
void CreateGraph(const NET &network,
void CreateGraph(NET &network,
const MKLDNNExtensionManager::Ptr& extMgr,
MKLDNNWeightsSharing::Ptr &w_cache);
@ -73,15 +71,14 @@ public:
return graphEdges;
}
std::vector<MKLDNNNodePtr>& GetOutputNodes() {
return outputNodes;
std::map<std::string, MKLDNNNodePtr>& GetInputNodesMap() {
return inputNodesMap;
}
std::map<std::string, MKLDNNNodePtr>& GetInputNodes() {
return inputNodes;
std::map<std::string, MKLDNNNodePtr>& GetOutputNodesMap() {
return outputNodesMap;
}
mkldnn::engine getEngine() const {
return eng;
}
@ -152,9 +149,6 @@ public:
InferenceEngine::CNNNetwork dump() const;
template<typename NET>
static void ApplyUnrollPasses(NET &net);
void ResetInferCount() { infer_count = 0; }
void SortTopologically();
@ -166,8 +160,8 @@ protected:
status = NotReady;
eng = mkldnn::engine(mkldnn::engine::kind::cpu, 0);
inputNodes.clear();
outputNodes.clear();
inputNodesMap.clear();
outputNodesMap.clear();
graphNodes.clear();
graphEdges.clear();
_meanImages.clear();
@ -183,8 +177,8 @@ protected:
MKLDNNMemoryPtr memWorkspace;
std::map<std::string, MKLDNNNodePtr> inputNodes;
std::vector<MKLDNNNodePtr> outputNodes;
std::map<std::string, MKLDNNNodePtr> inputNodesMap;
std::map<std::string, MKLDNNNodePtr> outputNodesMap;
std::vector<MKLDNNNodePtr> graphNodes;
std::vector<MKLDNNEdgePtr> graphEdges;
@ -194,7 +188,7 @@ protected:
static mkldnn::engine eng;
void Replicate(const InferenceEngine::CNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr);
void Replicate(const InferenceEngine::TensorIterator::Body &subgraph, const MKLDNNExtensionManager::Ptr& extMgr);
void Replicate(const std::shared_ptr<const ngraph::Function> &subgraph, const MKLDNNExtensionManager::Ptr& extMgr);
void InitGraph();
void InitNodes();
void InitDescriptors();
@ -204,22 +198,13 @@ protected:
void AllocateWithReuse();
void CreatePrimitives();
void ExecuteConstantNodesOnly();
void SetOriginalLayerNames();
friend class MKLDNNInferRequest;
friend class MKLDNNGraphlessInferRequest;
friend InferenceEngine::CNNNetwork dump_graph_as_ie_net(const MKLDNNGraph &graph);
friend InferenceEngine::CNNNetwork dump_graph_as_ie_ngraph_net(const MKLDNNGraph &graph);
private:
void dumpToDotFile(std::string file) const;
void printGraphInfo() const;
struct ParsedLayer {
MKLDNNNodePtr parent;
InferenceEngine::CNNLayerPtr cnnLayer;
size_t outIdx;
};
};
} // namespace MKLDNNPlugin

View File

@ -3,8 +3,6 @@
//
#include "mkldnn_graph_dumper.h"
#include <legacy/cnn_network_impl.hpp>
#include <legacy/ie_util_internal.hpp>
#include <ie_ngraph_utils.hpp>
#include "exec_graph_info.hpp"
#include "mkldnn_debug.h"
@ -22,188 +20,6 @@ namespace MKLDNNPlugin {
namespace {
std::map<std::string, std::string> extract_node_metadata(const MKLDNNNodePtr &);
void drawer_callback(const InferenceEngine::CNNLayerPtr, ordered_properties &, ordered_properties &);
} // namespace
CNNLayer::Ptr create_cnnlayer(const MKLDNNNodePtr &node) {
CNNLayer::Ptr layer(new CNNLayer({node->getName(), "type", Precision::FP32}));
layer->params = extract_node_metadata(node);
layer->type = layer->params[ExecGraphInfoSerialization::LAYER_TYPE];
layer->params.erase(ExecGraphInfoSerialization::LAYER_TYPE);
auto &cfg = node->getSelectedPrimitiveDescriptor()->getConfig();
layer->insData.resize(cfg.inConfs.size());
layer->outData.resize(cfg.outConfs.size());
return layer;
}
InferenceEngine::CNNNetwork dump_graph_as_ie_ngraph_net(const MKLDNNGraph &graph) {
std::map<MKLDNNNodePtr, std::shared_ptr<ngraph::Node> > node2layer;
ngraph::ResultVector results;
ngraph::ParameterVector params;
ngraph::NodeVector to_hold;
auto get_inputs = [&] (const MKLDNNNodePtr & node) {
auto pr_edges = node->getParentEdges();
ngraph::OutputVector inputs(pr_edges.size());
for (int i = 0; i < pr_edges.size(); i++) {
auto edge = node->getParentEdgeAt(i);
int pr_port = edge->getInputNum();
int ch_port = edge->getOutputNum();
auto pr_node = edge->getParent();
IE_ASSERT(node2layer.count(pr_node) == 1);
auto pr = node2layer[pr_node];
inputs[ch_port] = pr->output(pr_port);
}
return inputs;
};
auto create_ngraph_node = [&](const MKLDNNNodePtr &node) {
bool is_input = false, is_output = false, should_be_hold = false;
for (auto && kvp : graph.inputNodes) {
if (kvp.second == node) {
is_input = true;
break;
}
}
for (auto && onode : graph.outputNodes) {
if (onode == node) {
is_output = true;
break;
}
}
if (!is_output && node->getChildEdges().empty()) {
// The node has no consumer and is not an output.
// Should be hold in other irregular way.
should_be_hold = true;
}
auto meta_data = extract_node_metadata(node);
std::shared_ptr<ngraph::Node> return_node;
if (is_input) {
auto desc = node->getChildEdgeAt(0)->getDesc();
auto param = std::make_shared<ngraph::op::Parameter>(
details::convertPrecision(desc.getPrecision()),
ngraph::PartialShape(desc.getDims()));
return_node = param;
params.push_back(param);
} else if (is_output) {
results.emplace_back(std::make_shared<ngraph::op::Result>(get_inputs(node).back()));
return_node = results.back();
} else {
return_node = std::make_shared<ExecGraphInfoSerialization::ExecutionNode>(
get_inputs(node), node->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size());
for (size_t port = 0; port < return_node->get_output_size(); ++port) {
auto desc = node->getChildEdgeAt(port)->getDesc();
return_node->set_output_type(port,
details::convertPrecision(desc.getPrecision()),
ngraph::PartialShape(desc.getDims()));
}
}
if (should_be_hold) {
to_hold.push_back(return_node);
}
for (auto && kvp : meta_data)
return_node->get_rt_info()[kvp.first] = std::make_shared<::ngraph::VariantWrapper<std::string>>(kvp.second);
return_node->set_friendly_name(node->getName());
return return_node;
};
ngraph::NodeVector nodes;
nodes.reserve(graph.graphNodes.size());
for (auto &node : graph.graphNodes) { // important: graph.graphNodes are in topological order
nodes.emplace_back(create_ngraph_node(node));
node2layer[node] = nodes.back();
}
auto holder = results[0];
for (auto &node : to_hold) {
holder->add_control_dependency(node);
}
auto function = std::make_shared<ngraph::Function>(results, params, graph._name);
InferenceEngine::CNNNetwork net(function);
return net;
}
InferenceEngine::CNNNetwork dump_graph_as_ie_net(const MKLDNNGraph &graph) {
auto net = std::make_shared<details::CNNNetworkImpl>();
net->setName(graph._name);
std::map<MKLDNNNodePtr, CNNLayerPtr> node2layer;
// Copy all nodes to network
for (auto &node : graph.graphNodes) {
auto layer = create_cnnlayer(node);
node2layer[node] = layer;
net->addLayer(layer);
}
// Copy all edges to network
for (auto &node : graph.graphNodes) {
auto pr = node2layer[node];
auto ch_edges = node->getChildEdges();
for (int i = 0; i < ch_edges.size(); i++) {
auto edge = node->getChildEdgeAt(i);
int in_port = edge->getOutputNum();
auto ch_node = edge->getChild();
auto ch = node2layer[ch_node];
DataPtr data;
if (i < pr->outData.size()) {
std::string data_name = node->getName() + "_out" + std::to_string(i);
pr->outData[i] = std::make_shared<Data>(data_name, edge->getDesc());
data = pr->outData[i];
getCreatorLayer(data) = pr;
} else {
data = pr->outData[0];
}
getInputTo(data)[ch->name] = ch;
ch->insData[in_port] = data;
}
}
// Specify inputs data
for (auto kvp : graph.inputNodes) {
auto in_node = kvp.second;
auto in_layer = node2layer[in_node];
auto in_info = std::make_shared<InputInfo>();
in_info->setInputData(in_layer->outData[0]);
net->setInputInfo(in_info);
}
return InferenceEngine::CNNNetwork{net};
}
void dump_graph_as_dot(const MKLDNNGraph &graph, std::ostream &out) {
InferenceEngine::CNNNetwork dump_net = dump_graph_as_ie_net(graph);
InferenceEngine::saveGraphToDot(dump_net, out, drawer_callback);
}
//**********************************
// Special converters of meta data
//**********************************
namespace {
std::map<std::string, std::string> extract_node_metadata(const MKLDNNNodePtr &node) {
std::map<std::string, std::string> serialization_info;
@ -289,39 +105,106 @@ std::map<std::string, std::string> extract_node_metadata(const MKLDNNNodePtr &no
return serialization_info;
}
const char BLUE[] = "#D8D9F1";
const char GREEN[] = "#D9EAD3";
void drawer_callback(const InferenceEngine::CNNLayerPtr layer,
ordered_properties &printed_properties,
ordered_properties &node_properties) {
const auto &params = layer->params;
// Implementation
auto impl = params.find(ExecGraphInfoSerialization::IMPL_TYPE);
if (impl != params.end()) {
printed_properties.push_back({"impl", impl->second});
}
// Original names
auto orig = params.find(ExecGraphInfoSerialization::ORIGINAL_NAMES);
if (orig != params.end()) {
printed_properties.push_back({"originals", orig->second});
}
// Precision
auto prec = params.find(ExecGraphInfoSerialization::OUTPUT_PRECISIONS);
if (prec != params.end()) {
printed_properties.push_back({"precision", prec->second});
// Set color
node_properties.push_back({"fillcolor", prec->second == "FP32" ? GREEN : BLUE});
}
// Set xlabel containing PM data if calculated
auto perf = layer->params.find(ExecGraphInfoSerialization::PERF_COUNTER);
node_properties.push_back({"xlabel", (perf != layer->params.end()) ? perf->second : ""});
}
} // namespace
InferenceEngine::CNNNetwork dump_graph_as_ie_ngraph_net(const MKLDNNGraph &graph) {
std::map<MKLDNNNodePtr, std::shared_ptr<ngraph::Node> > node2layer;
ngraph::ResultVector results;
ngraph::ParameterVector params;
ngraph::NodeVector to_hold;
auto get_inputs = [&] (const MKLDNNNodePtr & node) {
auto pr_edges = node->getParentEdges();
ngraph::OutputVector inputs(pr_edges.size());
for (int i = 0; i < pr_edges.size(); i++) {
auto edge = node->getParentEdgeAt(i);
int pr_port = edge->getInputNum();
int ch_port = edge->getOutputNum();
auto pr_node = edge->getParent();
IE_ASSERT(node2layer.count(pr_node) == 1);
auto pr = node2layer[pr_node];
inputs[ch_port] = pr->output(pr_port);
}
return inputs;
};
auto create_ngraph_node = [&](const MKLDNNNodePtr &node) {
bool is_input = false, is_output = false, should_be_hold = false;
for (auto && kvp : graph.inputNodesMap) {
if (kvp.second == node) {
is_input = true;
break;
}
}
for (auto && kvp : graph.outputNodesMap) {
if (kvp.second == node) {
is_output = true;
break;
}
}
if (!is_output && node->getChildEdges().empty()) {
// The node has no consumers and is not an output.
// It has to be held alive in another way.
should_be_hold = true;
}
auto meta_data = extract_node_metadata(node);
std::shared_ptr<ngraph::Node> return_node;
if (is_input) {
auto desc = node->getChildEdgeAt(0)->getDesc();
auto param = std::make_shared<ngraph::op::Parameter>(
details::convertPrecision(desc.getPrecision()),
ngraph::PartialShape(desc.getDims()));
return_node = param;
params.push_back(param);
} else if (is_output) {
results.emplace_back(std::make_shared<ngraph::op::Result>(get_inputs(node).back()));
return_node = results.back();
} else {
return_node = std::make_shared<ExecGraphInfoSerialization::ExecutionNode>(
get_inputs(node), node->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size());
for (size_t port = 0; port < return_node->get_output_size(); ++port) {
auto desc = node->getChildEdgeAt(port)->getDesc();
return_node->set_output_type(port,
details::convertPrecision(desc.getPrecision()),
ngraph::PartialShape(desc.getDims()));
}
}
if (should_be_hold) {
to_hold.push_back(return_node);
}
for (auto && kvp : meta_data)
return_node->get_rt_info()[kvp.first] = std::make_shared<::ngraph::VariantWrapper<std::string>>(kvp.second);
return_node->set_friendly_name(node->getName());
return return_node;
};
ngraph::NodeVector nodes;
nodes.reserve(graph.graphNodes.size());
for (auto &node : graph.graphNodes) { // important: graph.graphNodes are in topological order
nodes.emplace_back(create_ngraph_node(node));
node2layer[node] = nodes.back();
}
auto holder = results[0];
for (auto &node : to_hold) {
holder->add_control_dependency(node);
}
auto function = std::make_shared<ngraph::Function>(results, params, graph._name);
InferenceEngine::CNNNetwork net(function);
return net;
}
} // namespace MKLDNNPlugin
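The ngraph-based dump built above backs the executable network's runtime graph report; a hedged usage sketch from the application side (public Inference Engine 2021 API, arbitrary file names):
#include <inference_engine.hpp>
// Retrieve the executed (runtime) graph and serialize it to IR-like XML/BIN files.
void dumpExecGraph(InferenceEngine::ExecutableNetwork &execNet) {
    InferenceEngine::CNNNetwork execGraph = execNet.GetExecGraphInfo();
    execGraph.serialize("exec_graph.xml", "exec_graph.bin");
}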

View File

@ -11,9 +11,6 @@
namespace MKLDNNPlugin {
void dump_graph_as_dot(const MKLDNNGraph &graph, std::ostream &out);
InferenceEngine::CNNNetwork dump_graph_as_ie_net(const MKLDNNGraph &graph);
InferenceEngine::CNNNetwork dump_graph_as_ie_ngraph_net(const MKLDNNGraph &graph);
} // namespace MKLDNNPlugin

File diff suppressed because it is too large

View File

@ -19,36 +19,26 @@ public:
void ApplyImplSpecificGraphOptimizations(MKLDNNGraph& graph);
private:
void MergeGroupConvolution(MKLDNNGraph& graph);
void MergeTwoEqualScaleShifts(MKLDNNGraph& graph);
void FuseConvolutionAndActivation(MKLDNNGraph &graph);
void FuseConvolutionAndBias(MKLDNNGraph &graph);
void FuseDeconvolutionAndSimpleOperation(MKLDNNGraph &graph);
void FuseMultiplyAndAdd(MKLDNNGraph &graph);
void FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &graph);
void FuseConvolutionAndDepthwise(MKLDNNGraph &graph);
void FuseConvolutionAndSimpleOperationThroughMaxPool(MKLDNNGraph &graph);
void FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph);
void FuseConvolutionAndDWConvolution(MKLDNNGraph &graph);
void FuseConvolutionAndQuantize(MKLDNNGraph &graph);
void FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph);
void FusePoolingAndQuantize(MKLDNNGraph &graph);
void FuseBatchNormWithScale(MKLDNNGraph& graph);
void FusePoolingAndFakeQuantize(MKLDNNGraph &graph);
void FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph);
void FuseMVNAndSimpleOperation(MKLDNNGraph &graph);
void FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph);
void FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph);
void RemoveIdentityOperator(MKLDNNGraph& graph);
void FuseNormalizeL2AndSimpleOperation(MKLDNNGraph &graph);
void RemoveIOScaleShifts(MKLDNNGraph& graph);
void DropDoubleReorders(MKLDNNGraph& graph);
void DropConvertReorder(MKLDNNGraph& graph);
void AddConvertToReorder(MKLDNNGraph &graph);
void FuseConvolutionAndZeroPoints(MKLDNNGraph &graph);
void FuseBroadcastAndEltwise(MKLDNNGraph &graph);
void FuseEltwiseAndSimple(MKLDNNGraph &graph);
void FuseScaleShiftAndQuantize(MKLDNNGraph &graph);
void FuseClampAndQuantize(MKLDNNGraph &graph);
void MergePermuteAndReorder(MKLDNNGraph &graph);
bool IsOneOf(Type type, std::vector<Type> types);
bool IsOneOf(EltwiseOpType alg, std::vector<EltwiseOpType> algs);
void FuseMulAddAndFakeQuantize(MKLDNNGraph &graph);
void FuseClampAndFakeQuantize(MKLDNNGraph &graph);
void MergeTransposeAndReorder(MKLDNNGraph &graph);
void removeEdge(MKLDNNGraph &graph, MKLDNNEdgePtr& edge);
};

View File

@ -20,7 +20,8 @@
#include "nodes/common/cpu_memcpy.h"
#include "mkldnn_async_infer_request.h"
#include <debug.h>
#include "utils/general_utils.h"
#include "utils/cpu_utils.hpp"
MKLDNNPlugin::MKLDNNInferRequest::MKLDNNInferRequest(InferenceEngine::InputsDataMap networkInputs,
InferenceEngine::OutputsDataMap networkOutputs,
@ -103,33 +104,14 @@ void MKLDNNPlugin::MKLDNNInferRequest::PushInputData() {
IE_THROW() << "Input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name " << input.first;
}
auto inPrec = input.second->getTensorDesc().getPrecision();
if (graph->hasMeanImageFor(input.first) && one_of(inPrec, InferenceEngine::Precision::U8, InferenceEngine::Precision::BOOL)) {
inPrec = InferenceEngine::Precision::FP32;
} else {
inPrec = normalizeToSupportedPrecision(inPrec);
}
switch (inPrec) {
// these precisions are supported by mkldnn, so we push the blob directly
case InferenceEngine::Precision::I8:
case InferenceEngine::Precision::I32:
case InferenceEngine::Precision::BF16:
case InferenceEngine::Precision::FP32: {
break;
}
// these precisions are supported by mkldnn, so we push the blob directly
// BUT if a mean image exists, we convert the blob and send FP32
case InferenceEngine::Precision::U8:
case InferenceEngine::Precision::BOOL: {
if (graph->hasMeanImageFor(input.first))
inPrec = InferenceEngine::Precision::FP32;
break;
}
// these precisions are unsupported by mkldnn, so we convert the blob and send I32
case InferenceEngine::Precision::U16:
case InferenceEngine::Precision::I16:
case InferenceEngine::Precision::I64:
case InferenceEngine::Precision::U64: {
inPrec = InferenceEngine::Precision::I32;
break;
}
default:
IE_THROW() << "Unsupported input precision " << input.second->getTensorDesc().getPrecision();
if (inPrec == InferenceEngine::Precision::UNSPECIFIED) {
IE_THROW() << "Unsupported input precision " << input.second->getTensorDesc().getPrecision();
}
// User can initialize input via setBlob API using tensorDesc with default (ANY) layout.
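The removed precision switch is assumed to be folded into normalizeToSupportedPrecision from utils/cpu_utils.hpp; a hedged sketch of the mapping, reconstructed from the deleted branches (the real helper may differ):
#include <ie_precision.hpp>
// Sketch only: supported precisions pass through, 16/64-bit integer types collapse
// to I32, and anything else becomes UNSPECIFIED so that the caller above can throw.
inline InferenceEngine::Precision normalizeToSupportedPrecisionSketch(InferenceEngine::Precision prec) {
    switch (prec) {
        case InferenceEngine::Precision::I8:
        case InferenceEngine::Precision::U8:
        case InferenceEngine::Precision::BOOL:
        case InferenceEngine::Precision::I32:
        case InferenceEngine::Precision::BF16:
        case InferenceEngine::Precision::FP32:
            return prec;                                    // pushed to mkldnn as is
        case InferenceEngine::Precision::U16:
        case InferenceEngine::Precision::I16:
        case InferenceEngine::Precision::I64:
        case InferenceEngine::Precision::U64:
            return InferenceEngine::Precision::I32;         // converted before pushing
        default:
            return InferenceEngine::Precision::UNSPECIFIED; // rejected by the caller
    }
}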
@ -246,7 +228,6 @@ InferenceEngine::Blob::Ptr MKLDNNPlugin::MKLDNNInferRequest::GetBlob(const std::
}
InferenceEngine::TensorDesc desc = blobs[name]->getTensorDesc();
InferenceEngine::Precision originPrecision = blobs[name]->getTensorDesc().getPrecision();
if (_networkInputs.find(name) != _networkInputs.end()) {
InferenceEngine::Layout l = _networkInputs[name]->getLayout();
InferenceEngine::Precision p = _networkInputs[name]->getPrecision();
@ -257,7 +238,7 @@ InferenceEngine::Blob::Ptr MKLDNNPlugin::MKLDNNInferRequest::GetBlob(const std::
_inputs[name] = make_blob_with_precision(desc);
_inputs[name]->allocate();
if (desc.getPrecision() == originPrecision &&
if (blobs[name]->getTensorDesc() == desc &&
graph->_meanImages.find(name) == graph->_meanImages.end() && !graph->getProperty().batchLimit) {
externalPtr[name] = _inputs[name]->buffer();
}
@ -274,7 +255,8 @@ InferenceEngine::Blob::Ptr MKLDNNPlugin::MKLDNNInferRequest::GetBlob(const std::
return data;
}
InferenceEngine::TensorDesc desc = blobs[name]->getTensorDesc();
InferenceEngine::TensorDesc desc = _networkOutputs[name]->getTensorDesc();
desc.setPrecision(normalizeToSupportedPrecision(desc.getPrecision()));
// WA: need to avoid exception thrown when we compare blocking desc in SetBlob
// in situation if we push output blobs as inputs for next network (in Hetero plugin)
@ -285,7 +267,7 @@ InferenceEngine::Blob::Ptr MKLDNNPlugin::MKLDNNInferRequest::GetBlob(const std::
_outputs[name] = make_blob_with_precision(desc);
_outputs[name]->allocate();
if (desc.getPrecision() == InferenceEngine::Precision::FP32 && !graph->getProperty().batchLimit) {
if (blobs[name]->getTensorDesc() == desc && !graph->getProperty().batchLimit) {
externalPtr[name] = _outputs[name]->buffer();
}
data = _outputs[name];
@ -351,7 +333,12 @@ void MKLDNNPlugin::MKLDNNInferRequest::SetBlob(const std::string& name, const In
IE_THROW(ParameterMismatch) << "Failed to set input blob. Blocking descriptor mismatch.";
}
if (data->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32 &&
InferenceEngine::BlobMap blobs;
graph->getInputBlobs(blobs);
if (blobs.find(name) == blobs.end())
IE_THROW() << "MKLDNN graph doesn't contain input node with name: " << name;
if (data->getTensorDesc() == blobs.at(name)->getTensorDesc() &&
graph->_meanImages.find(name) == graph->_meanImages.end() && !graph->getProperty().batchLimit) {
externalPtr[name] = data->buffer();
} else if (externalPtr.find(name) != externalPtr.end()) {
@ -382,7 +369,13 @@ void MKLDNNPlugin::MKLDNNInferRequest::SetBlob(const std::string& name, const In
foundOutput->getTensorDesc().getBlockingDesc() != data->getTensorDesc().getBlockingDesc()) {
IE_THROW(ParameterMismatch) << "Failed to set output blob. Blocking descriptor mismatch.";
}
if (data->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32 &&
InferenceEngine::BlobMap blobs;
graph->getOutputBlobs(blobs);
if (blobs.find(name) == blobs.end())
IE_THROW() << "MKLDNN graph doesn't contain output node with name: " << name;
if (data->getTensorDesc() == blobs.at(name)->getTensorDesc() &&
!graph->getProperty().batchLimit) {
externalPtr[name] = data->buffer();
} else if (externalPtr.find(name) != externalPtr.end()) {
@ -398,8 +391,8 @@ static inline void changeEdgePtr(const MKLDNNPlugin::MKLDNNEdgePtr &edge, void *
void MKLDNNPlugin::MKLDNNInferRequest::changeDefaultPtr() {
for (auto& it : externalPtr) {
auto input = graph->inputNodes.find(it.first);
if (input != graph->inputNodes.end()) {
auto input = graph->inputNodesMap.find(it.first);
if (input != graph->inputNodesMap.end()) {
if (input->second->getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle() == it.second)
continue;
// Input cannot be in-place with other primitives
@ -432,9 +425,9 @@ void MKLDNNPlugin::MKLDNNInferRequest::changeDefaultPtr() {
}
MKLDNNNodePtr output;
for (auto& out : graph->outputNodes) {
if (out->getName() == "out_" + it.first) {
output = out;
for (auto& out : graph->outputNodesMap) {
if (out.first == it.first) {
output = out.second;
break;
}
}
@ -493,4 +486,4 @@ void MKLDNNPlugin::MKLDNNInferRequest::ThrowIfCanceled() const {
if (_asyncRequest != nullptr) {
_asyncRequest->ThrowIfCanceled();
}
}
}

View File

@ -489,8 +489,8 @@ static const std::map<int, std::vector<mkldnn::memory::format_tag>> form_tags_by
mkldnn::memory::format_tag::aBCde4c8b2c,
}}, {6, { // Popular
mkldnn::memory::format_tag::abcdef, // plain
mkldnn::memory::format_tag::acbdef, // permuted
mkldnn::memory::format_tag::defcab, // permuted
mkldnn::memory::format_tag::acbdef, // permute
mkldnn::memory::format_tag::defcab, // permute
mkldnn::memory::format_tag::aBcdef16b, // blocked 16c
mkldnn::memory::format_tag::aBCdef16b16c,
@ -565,18 +565,46 @@ bool MKLDNNMemoryDesc::isSame(mkldnn::memory::format_tag fmt) const {
auto refStrides = refDesc.data.format_desc.blocking.strides;
std::vector<size_t> actualOrder(desc.data.ndims);
std::iota(actualOrder.begin(), actualOrder.end(), 0);
std::sort(actualOrder.begin(), actualOrder.end(),
[&actualStrides] (size_t ind_l, size_t ind_r) {
return actualStrides[ind_l] > actualStrides[ind_r];
});
{
const auto dims = desc.dims();
std::vector<size_t> total_block_per_dim(dims.size(), 1);
const auto &blk_desc = desc.data.format_desc.blocking;
for (int i = 0; i < blk_desc.inner_nblks; i++) {
total_block_per_dim[blk_desc.inner_idxs[i]] *= blk_desc.inner_blks[i];
}
std::vector<size_t> outer_block_dims(std::begin(dims), std::begin(dims) + dims.size());
for (size_t i = 0; i < outer_block_dims.size(); i++) {
outer_block_dims[i] = div_up(outer_block_dims[i], total_block_per_dim[i]);
}
std::iota(actualOrder.begin(), actualOrder.end(), 0);
std::sort(actualOrder.begin(), actualOrder.end(),
[&actualStrides, &outer_block_dims] (size_t ind_l, size_t ind_r) {
return (actualStrides[ind_l] > actualStrides[ind_r]) ||
(actualStrides[ind_l] == actualStrides[ind_r] && outer_block_dims[ind_l] > outer_block_dims[ind_r]);
});
}
std::vector<size_t> refOrder(refDesc.data.ndims);
std::iota(refOrder.begin(), refOrder.end(), 0);
std::sort(refOrder.begin(), refOrder.end(),
[&refStrides] (size_t ind_l, size_t ind_r) {
return refStrides[ind_l] > refStrides[ind_r];
});
{
const auto dims = refDesc.dims();
std::vector<size_t> total_block_per_dim(dims.size(), 1);
const auto &blk_desc = refDesc.data.format_desc.blocking;
for (int i = 0; i < blk_desc.inner_nblks; i++) {
total_block_per_dim[blk_desc.inner_idxs[i]] *= blk_desc.inner_blks[i];
}
std::vector<size_t> outer_block_dims(std::begin(dims), std::begin(dims) + dims.size());
for (size_t i = 0; i < outer_block_dims.size(); i++) {
outer_block_dims[i] = div_up(outer_block_dims[i], total_block_per_dim[i]);
}
std::iota(refOrder.begin(), refOrder.end(), 0);
std::sort(refOrder.begin(), refOrder.end(),
[&refStrides, &outer_block_dims] (size_t ind_l, size_t ind_r) {
return (refStrides[ind_l] > refStrides[ind_r]) ||
(refStrides[ind_l] == refStrides[ind_r] && outer_block_dims[ind_l] > outer_block_dims[ind_r]);
});
}
if (actualOrder != refOrder) {
return false;
@ -682,14 +710,6 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const {
const size_t inner_ndims = blk_desc.inner_nblks;
const size_t total_ndims = outer_ndims + inner_ndims;
// order of outer dims. In case of IOhw_ will be {1, 0, 2, 3}
std::vector<size_t> outer_order(outer_ndims);
std::iota(outer_order.begin(), outer_order.end(), 0);
std::sort(outer_order.begin(), outer_order.end(),
[&blk_desc] (size_t ind_l, size_t ind_r) {
return blk_desc.strides[ind_l] > blk_desc.strides[ind_r];
});
// strides of inner dims. In case of 4i16o4i will be {64, 4, 1}
std::vector<size_t> inner_strides(inner_ndims, 1);
for (size_t i = 1; i < blk_desc.inner_nblks; i++) {
@ -701,6 +721,19 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const {
for (int i = 0; i < inner_ndims; i++) {
total_block_per_dim[blk_desc.inner_idxs[i]] *= blk_desc.inner_blks[i];
}
std::vector<size_t> outer_block_dims(std::begin(dims), std::begin(dims) + outer_ndims);
for (size_t i = 0; i < outer_block_dims.size(); i++) {
outer_block_dims[i] = div_up(outer_block_dims[i], total_block_per_dim[i]);
}
// order of outer dims. In case of IOhw_ will be {1, 0, 2, 3}
std::vector<size_t> outer_order(outer_ndims);
std::iota(outer_order.begin(), outer_order.end(), 0);
std::sort(outer_order.begin(), outer_order.end(),
[&blk_desc, &outer_block_dims] (size_t ind_l, size_t ind_r) {
return (blk_desc.strides[ind_l] > blk_desc.strides[ind_r]) ||
(blk_desc.strides[ind_l] == blk_desc.strides[ind_r] && outer_block_dims[ind_l] > outer_block_dims[ind_r]);
});
// IE blocked order
// [new_outer_order] U [inner_idxs]
@ -721,7 +754,7 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const {
std::copy(blk_desc.inner_blks, blk_desc.inner_blks + blk_desc.inner_nblks,
ie_blk_dims.end() - blk_desc.inner_nblks);
std::transform(outer_order.begin(), outer_order.end(), ie_blk_dims.begin(),
[&] (size_t i) { return div_up(dims[i], total_block_per_dim[i]); });
[&] (size_t i) { return outer_block_dims[i]; });
// IE offset padded to data. Same as for oneDNN
SizeVector ie_blk_offset_to_data {desc.data.padded_offsets, desc.data.padded_offsets + desc.data.ndims};
@ -742,7 +775,7 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const {
MKLDNNMemory::convertToIePrec(desc.data_type()),
SizeVector {begin(dims), end(dims)},
ie_blk_desc };
// TODO: BLOCKED is the most common layout which covers all other permuted layout like NHWC.
// TODO: BLOCKED is the most common layout which covers all other permute layout like NHWC.
// But for some cases we have to specify it more correctly.. may be.. or just keep
// auto detected layout in constructor of TensorDesc.
return res;
@ -809,7 +842,7 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc):
is_descending_strides &= (ie_strides[i-1] >= ie_strides[i]);
}
// TODO: That's a strong constraint and can be mitigated. IE::TensorDesc allows to permute blocked dims
// TODO: That's a strong constraint and can be mitigated. IE::TensorDesc allows to transpose blocked dims
// and maybe we can achieve the correct "descending strides" form which allows conversion.
if (!is_descending_strides)
IE_THROW() << "Unsupported case for conversion";

View File

@ -13,12 +13,11 @@
#include <cstdint>
#include <unordered_map>
#include <nodes/mkldnn_batchnorm_node.h>
#include <nodes/mkldnn_concat_node.h>
#include <nodes/mkldnn_conv_node.h>
#include <nodes/mkldnn_deconv_node.h>
#include <nodes/mkldnn_eltwise_node.h>
#include <nodes/mkldnn_gemm_node.h>
#include <nodes/mkldnn_matmul_node.h>
#include <nodes/mkldnn_fullyconnected_node.h>
#include <nodes/mkldnn_generic_node.h>
#include <nodes/mkldnn_input_node.h>
@ -30,7 +29,7 @@
#include <nodes/mkldnn_tile_node.h>
#include <nodes/mkldnn_split_node.h>
#include <nodes/mkldnn_pad_node.h>
#include <nodes/mkldnn_permute_node.h>
#include <nodes/mkldnn_transpose_node.h>
#include <nodes/mkldnn_memory_node.hpp>
#include <nodes/mkldnn_mvn_node.h>
#include <nodes/mkldnn_normalize_node.h>
@ -41,6 +40,8 @@
#include <nodes/mkldnn_depth_to_space_node.h>
#include <nodes/mkldnn_space_to_depth_node.h>
#include <nodes/mkldnn_strided_slice_node.h>
#include <nodes/mkldnn_reference_node.h>
#include <nodes/mkldnn_fake_quantize_node.h>
#include <mkldnn_types.h>
#include <dnnl_types.h>
#include "mkldnn_extension_utils.h"
@ -49,6 +50,10 @@
#include "mkldnn_debug.h"
#include "utils/rt_info/memory_formats_attribute.hpp"
#include <ie_ngraph_utils.hpp>
#include "utils/general_utils.h"
#include "utils/cpu_utils.hpp"
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace openvino;
@ -56,94 +61,120 @@ using namespace openvino;
using namespace InferenceEngine::details;
namespace MKLDNNPlugin {
static const InferenceEngine::details::caseless_unordered_map<std::string, Type> type_to_name_tbl = {
{ "Unknown", Unknown },
{ "Input", Input },
{ "Const", Input },
{ "Output", Output },
{ "Reorder", Reorder },
{ "Constant", Input },
{ "Parameter", Input },
{ "Result", Output },
{ "Convolution", Convolution },
{ "ReLU", Eltwise },
{ "GELU", Eltwise },
{ "ELU", Eltwise },
{ "GroupConvolution", Convolution },
{ "MatMul", MatMul },
{ "FullyConnected", FullyConnected },
{ "MaxPool", Pooling },
{ "AvgPool", Pooling },
{ "Add", Eltwise },
{ "Subtract", Eltwise },
{ "Multiply", Eltwise },
{ "Divide", Eltwise },
{ "SquaredDifference", Eltwise },
{ "Maximum", Eltwise },
{ "Minimum", Eltwise },
{ "Mod", Eltwise },
{ "FloorMod", Eltwise },
{ "Power", Eltwise },
{ "PowerStatic", Eltwise },
{ "Equal", Eltwise },
{ "NotEqual", Eltwise },
{ "Greater", Eltwise },
{ "GreaterEqual", Eltwise },
{ "Less", Eltwise },
{ "LessEqual", Eltwise },
{ "LogicalAnd", Eltwise },
{ "LogicalOr", Eltwise },
{ "LogicalXor", Eltwise },
{ "LogicalNot", Eltwise },
{ "Relu", Eltwise },
{ "LeakyRelu", Eltwise },
{ "Gelu", Eltwise },
{ "Elu", Eltwise },
{ "Tanh", Eltwise },
{ "Sigmoid", Eltwise },
{ "Logistic", Eltwise },
{ "TanH", Eltwise },
{ "ReLU6", Eltwise },
{ "Exp", Eltwise },
{ "Not", Eltwise },
{ "Activation", Eltwise },
{ "Abs", Eltwise },
{ "Sqrt", Eltwise },
{ "Clamp", Eltwise },
{ "Swish", Eltwise },
{ "Exp", Eltwise },
{ "SwishCPU", Eltwise },
{ "HSwish", Eltwise },
{ "Mish", Eltwise },
{ "HSigmoid", Eltwise },
{ "Round", Eltwise },
{ "ScaleShift", Eltwise },
{ "PReLU", Eltwise },
{ "PRelu", Eltwise },
{ "Erf", Eltwise },
{ "SoftPlus", Eltwise },
{ "Norm", Lrn },
{ "LRN", Lrn },
{ "Pooling", Pooling },
{ "FullyConnected", FullyConnected },
{ "InnerProduct", FullyConnected },
{ "Gemm", Gemm },
{ "Softmax", SoftMax },
{ "SoftMax", SoftMax },
{ "Split", Split },
{ "Slice", Split },
{ "Concat", Concatenation },
{ "Deconvolution", Deconvolution },
{ "Eltwise", Eltwise },
{ "Mod", Eltwise },
{ "Power", Eltwise },
{ "Reshape", Reshape },
{ "Squeeze", Reshape },
{ "Unsqueeze", Reshape },
{ "Softmax", Softmax },
{ "Reorder", Reorder },
{ "BatchToSpace", BatchToSpace },
{ "SpaceToBatch", SpaceToBatch },
{ "DepthToSpace", DepthToSpace },
{ "SpaceToDepth", SpaceToDepth },
{ "Roll", Roll },
{ "LRN", Lrn },
{ "Split", Split },
{ "VariadicSplit", Split },
{ "Concat", Concatenation },
{ "ConvolutionBackpropData", Deconvolution },
{ "GroupConvolutionBackpropData", Deconvolution },
{ "StridedSlice", StridedSlice },
{ "Tile", Tile },
{ "SimplerNMS", SimplerNMS },
{ "ROIAlign", ROIAlign },
{ "ROIPooling", ROIPooling },
{ "BatchNormalization", BatchNormalization },
{ "DepthToSpace", DepthToSpace },
{ "Flatten", Flatten },
{ "PSROIPooling", PSROIPooling },
{ "DeformablePSROIPooling", PSROIPooling },
{ "Pad", Pad },
{ "Permute", Permute },
{ "SpaceToDepth", SpaceToDepth },
{ "StridedSlice", StridedSlice },
{ "Copy", Copy },
{ "Transpose", Transpose },
{ "LSTMCell", RNNCell },
{ "GRUCell", RNNCell },
{ "RNNCell", RNNCell },
{ "LSTMSequence", RNNSeq },
{ "GRUSequence", RNNSeq },
{ "RNNSequence", RNNSeq },
{ "Quantize", Quantize },
{ "FakeQuantize", Quantize },
{ "FakeQuantize", FakeQuantize },
{ "BinaryConvolution", BinaryConvolution },
{ "DeformableConvolution", DeformableConvolution },
{ "TensorIterator", TensorIterator },
{ "Loop", TensorIterator },
{ "MemoryInput", MemoryInput}, // for construction from name ctor, arbitrary name is used
{ "Memory", MemoryOutput }, // for construction from layer ctor
{ "ReadValue", MemoryInput}, // for construction from name ctor, arbitrary name is used
{ "Assign", MemoryOutput }, // for construction from layer ctor
{ "Convert", Convert },
{ "MVN", MVN},
{ "Normalize", Normalize},
{ "NormalizeL2", NormalizeL2},
{ "ScatterUpdate", ScatterUpdate},
{ "ScatterElementsUpdate", ScatterElementsUpdate},
{ "ScatterNDUpdate", ScatterNDUpdate},
{ "Interpolate", Interpolate},
{ "ReduceAnd", ReduceAnd},
{ "ReduceL1", ReduceL1},
{ "ReduceL2", ReduceL2},
{ "ReduceLogSum", ReduceLogSum},
{ "ReduceLogSumExp", ReduceLogSumExp},
{ "ReduceMax", ReduceMax},
{ "ReduceMean", ReduceMean},
{ "ReduceMin", ReduceMin},
{ "ReduceOr", ReduceOr},
{ "ReduceProd", ReduceProd},
{ "ReduceSum", ReduceSum},
{ "ReduceSumSquare", ReduceSumSquare},
{ "Erf", Eltwise },
{ "Roll", Roll },
{ "ReduceL1", Reduce},
{ "ReduceL2", Reduce},
{ "ReduceLogicalAnd", Reduce},
{ "ReduceLogicalOr", Reduce},
{ "ReduceMax", Reduce},
{ "ReduceMean", Reduce},
{ "ReduceMin", Reduce},
{ "ReduceProd", Reduce},
{ "ReduceSum", Reduce},
{ "ReduceLogSum", Reduce},
{ "ReduceLogSumExp", Reduce},
{ "ReduceSumSquare", Reduce},
{ "Broadcast", Broadcast},
{ "EmbeddingSegmentsSum", EmbeddingSegmentsSum},
{ "EmbeddingBagPackedSum", EmbeddingBagPackedSum},
{ "EmbeddingBagOffsetsSum", EmbeddingBagOffsetsSum},
{ "Gather", Gather},
{ "GatherElements", GatherElements},
{ "GatherND", GatherND},
{ "OneHot", OneHot},
{ "RegionYolo", RegionYolo},
{ "Select", Select}
};
Type TypeFromName(const std::string type) {
@ -162,44 +193,65 @@ MKLDNNNode::NodesFactory & MKLDNNNode::factory() {
return factoryInstance;
}
MKLDNNNode::MKLDNNNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng,
MKLDNNWeightsSharing::Ptr &w_cache)
MKLDNNNode::MKLDNNNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &w_cache)
: selectedPrimitiveDescriptorIndex(-1), permanent(false), temporary(false), constant(ConstantType::Unknown),
weightCache(w_cache), cnnLayer(layer), engine(eng), name(layer->name), typeStr(layer->type),
type(TypeFromName(layer->type)), profiling(layer->name) {
if (!layer->outData.empty()) {
for (const auto& outData : layer->outData) {
outDims.emplace_back(outData->getDims());
weightCache(w_cache), engine(eng), name(op->get_friendly_name()), typeStr(op->get_type_name()),
type(TypeFromName(op->get_type_name())), profiling(op->get_friendly_name()) {
algorithm = Algorithm::Undefined;
fusingPort = -1;
const std::string errorPrefix = "Ngraph operation " + std::string(op->get_type_name()) + " with name " + op->get_friendly_name();
for (size_t i = 0; i < op->get_input_size(); i++) {
if (op->get_input_partial_shape(i).is_dynamic())
IE_THROW() << errorPrefix << " has dynamic input shape on " << i << " port, but CPU plug-in supports only static shape";
}
for (size_t i = 0; i < op->get_output_size(); i++) {
if (op->get_output_partial_shape(i).is_dynamic())
IE_THROW() << errorPrefix << " has dynamic output shape on " << i << " port, but CPU plug-in supports only static shape";
}
for (size_t i = 0; i < op->get_input_size(); i++) {
const auto &shape = op->get_input_shape(i);
inDims.emplace_back(ngraph::is_scalar(shape) ? ngraph::Shape{1} : shape);
originalInputPrecisions.emplace_back(details::convertPrecision(op->get_input_element_type(i)));
}
if (typeStr != "Result" && typeStr != "Assign") {
if (op->get_output_size() == 0) {
IE_THROW() << "Node with type '" << typeStr << "' and name '" << name << "' does not have any outputs.";
}
} else {
if (!(CaselessEq<std::string>()(layer->type, "memory") ||
CaselessEq<std::string>()(layer->type, "memoryinput") ||
CaselessEq<std::string>()(layer->type, "output") ||
CaselessEq<std::string>()(layer->type, "reorder") ||
CaselessEq<std::string>()(layer->type, "convert"))) {
IE_THROW() << "Inappropriate layer type: " << layer->type << " name: " << layer->name;
for (size_t i = 0; i < op->get_output_size(); i++) {
const auto &shape = op->get_output_shape(i);
outDims.emplace_back(ngraph::is_scalar(shape) ? ngraph::Shape{1} : shape);
originalOutputPrecisions.emplace_back(details::convertPrecision(op->get_output_element_type(i)));
}
}
for (const auto& inData : layer->insData) {
inDims.emplace_back(inData.lock()->getDims());
const auto& rtInfo = op->get_rt_info();
if (rtInfo.count("originalLayersNames")) {
originalLayers = getRTInfoValue(rtInfo, "originalLayersNames");
}
if (layer->params.find("PrimitivesPriority") != layer->params.end()) {
std::istringstream stream(layer->params["PrimitivesPriority"]);
if (originalLayers.empty()) {
addOriginalLayer(name);
}
auto primitivesPriority = getPrimitivesPriorityValue(op);
if (!primitivesPriority.empty()) {
std::istringstream stream(primitivesPriority);
std::string str;
while (getline(stream, str, ',')) {
if (str.substr(0, 4) != "cpu:")
continue;
implPriorities.push_back(parse_impl_name(str));
if (implPriorities[implPriorities.size() - 1] == impl_desc_type::unknown &&
str != "cpu:unknown")
str != "cpu:unknown")
IE_THROW() << "Unsupported CPU implementation " << str << " for node " << getName();
}
}
auto ngraphNode = layer->getNode();
if (ngraphNode != nullptr) {
std::string inputMemoryFormats = ngraph::getMLKDNNInputMemoryFormats(ngraphNode);
if (op != nullptr) {
std::string inputMemoryFormats = ngraph::getMLKDNNInputMemoryFormats(op);
if (!inputMemoryFormats.empty()) {
std::istringstream stream(inputMemoryFormats);
std::string str;
@ -210,7 +262,7 @@ MKLDNNNode::MKLDNNNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::
}
}
std::string outputMemoryFormats = ngraph::getMLKDNNOutputMemoryFormats(ngraphNode);
std::string outputMemoryFormats = ngraph::getMLKDNNOutputMemoryFormats(op);
if (!outputMemoryFormats.empty()) {
std::istringstream stream(outputMemoryFormats);
std::string str;
@ -223,6 +275,13 @@ MKLDNNNode::MKLDNNNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::
}
}
MKLDNNNode::MKLDNNNode(const std::string& type, const std::string& name, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &w_cache)
: selectedPrimitiveDescriptorIndex(-1), permanent(false), temporary(false), constant(ConstantType::Unknown),
weightCache(w_cache), engine(eng), name(name), typeStr(type),
type(TypeFromName(type)), profiling(name) {
// TODO [NM]: What about filling inDims and outDims?
}
void MKLDNNNode::addEdge(const MKLDNNEdgeWeakPtr& edge) {
auto edgePtr = edge.lock();
if (!edgePtr)
@ -669,67 +728,6 @@ void MKLDNNNode::initDescriptor(const InferenceEngine::LayerConfig &config) {
selectedPD->getConfig() = rightConfig;
}
InferenceEngine::Blob::Ptr MKLDNNNode::createInternalBlob(InferenceEngine::SizeVector dims, bool weights, bool isGrouped) {
auto checkSize = [](size_t dst_size, size_t src_size) {
if (dst_size < src_size) {
IE_THROW() << "Cannot create internal buffer. Buffer can be overrun.";
}
};
auto * wLayer = dynamic_cast<InferenceEngine::WeightableLayer*>(getCnnLayer().get());
if (wLayer == nullptr)
IE_THROW() << "Cannot get weightable layer for node " << getName() << ".";
InferenceEngine::Blob::Ptr blb = weights ? wLayer->_weights : wLayer->_biases;
if (blb == nullptr)
IE_THROW() << "Cannot get internal blob layer for node " << getName() << ".";
auto intLayout = getWeightsLayoutByDims(dims, isGrouped);
InferenceEngine::TensorDesc desc(blb->getTensorDesc().getPrecision(), dims, intLayout);
auto fillInternalBlob = [&](char *data, size_t intBuffSize) {
size_t offset = blb->byteSize();
checkSize(intBuffSize, offset);
cpu_memcpy_s(data, intBuffSize, blb->buffer(), blb->byteSize());
data += blb->byteSize();
for (const auto &merged : getMergeWith()) {
wLayer = dynamic_cast<InferenceEngine::WeightableLayer*>(merged->getCnnLayer().get());
if (wLayer == nullptr)
IE_THROW() << "Cannot convert merged weightable layer for node "
<< getName() << ".";
blb = weights ? wLayer->_weights : wLayer->_biases;
if (blb == nullptr)
IE_THROW() << "Cannot get internal blob layer for node " << getName() << ".";
offset += blb->byteSize();
checkSize(intBuffSize, offset);
cpu_memcpy_s(data, intBuffSize, blb->buffer(), blb->byteSize());
data += blb->byteSize();
}
};
Blob::Ptr internalBlob;
if (blb->getTensorDesc().getPrecision() == Precision::BIN) {
internalBlob = InferenceEngine::make_shared_blob<int8_t>(desc);
} else if (blb->getTensorDesc().getPrecision() == Precision::I8) {
internalBlob = InferenceEngine::make_shared_blob<int8_t>(desc);
} else if (blb->getTensorDesc().getPrecision() == Precision::I32) {
internalBlob = InferenceEngine::make_shared_blob<int32_t>(desc);
} else if (blb->getTensorDesc().getPrecision() == Precision::BF16) {
internalBlob = InferenceEngine::make_shared_blob<int16_t>(desc);
} else {
internalBlob = InferenceEngine::make_shared_blob<float>(desc);
}
internalBlob->allocate();
char *data = internalBlob->buffer();
size_t intBuffSize = internalBlob->byteSize();
fillInternalBlob(data, intBuffSize);
return internalBlob;
}
void MKLDNNNode::prepareMemory(const PrimitiveDescInfo *selected_pd, mkldnn::primitive_desc_iterator& itpd) {
for (size_t i = 0; i < getChildEdges().size(); i++) {
auto &dstMemPtr = getChildEdgeAt(i)->getMemoryPtr();
@ -837,18 +835,17 @@ MKLDNNNode::ConstantType MKLDNNNode::checkConstant(LOOK look, std::vector<MKLDNN
return constant;
}
void MKLDNNNode::addOriginalLayer(const InferenceEngine::CNNLayerPtr &layer) {
if (!layer) return;
void MKLDNNNode::addOriginalLayer(const std::string& layerName) {
if (layerName.empty()) return;
if (originalLayers.empty()) {
originalLayers = layer->name;
originalLayers = layerName;
} else {
originalLayers += "," + layer->name;
originalLayers += "," + layerName;
}
}
void MKLDNNNode::cleanup() {
internalBlobs.clear();
cnnLayer.reset();
for (auto it : fusedWith) {
it->cleanup();
@ -1185,18 +1182,54 @@ InferenceEngine::Precision MKLDNNNode::getRuntimePrecision() const {
return runtimePrecision;
}
MKLDNNNode* MKLDNNNode::NodesFactory::create(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng,
MKLDNNNode* MKLDNNNode::NodesFactory::create(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng,
const MKLDNNExtensionManager::Ptr& extMgr, MKLDNNWeightsSharing::Ptr &w_cache) {
MKLDNNNode *newNode = nullptr;
std::unique_ptr<MKLDNNNode> ol(createNodeIfRegistered(MKLDNNPlugin, Generic, layer, eng, w_cache));
if (ol != nullptr && ol->created(extMgr))
newNode = ol.release();
if (newNode == nullptr) {
std::unique_ptr<MKLDNNNode> ol(createNodeIfRegistered(MKLDNNPlugin, TypeFromName(layer->type), layer, eng, w_cache));
std::string errorMessage;
try {
std::unique_ptr<MKLDNNNode> ol(createNodeIfRegistered(MKLDNNPlugin, Generic, op, eng, w_cache));
if (ol != nullptr && ol->created(extMgr))
newNode = ol.release();
} catch (const InferenceEngine::Exception& ex) {
IE_SUPPRESS_DEPRECATED_START
if (ex.getStatus() != NOT_IMPLEMENTED) {
throw;
} else {
errorMessage += getExceptionDescWithoutStatus(ex);
}
IE_SUPPRESS_DEPRECATED_END
}
if (newNode == nullptr) {
try {
std::unique_ptr<MKLDNNNode> ol(createNodeIfRegistered(MKLDNNPlugin, TypeFromName(op->get_type_name()), op, eng, w_cache));
if (ol != nullptr && ol->created(extMgr))
newNode = ol.release();
} catch (const InferenceEngine::Exception& ex) {
IE_SUPPRESS_DEPRECATED_START
if (ex.getStatus() != NOT_IMPLEMENTED) {
throw;
} else {
errorMessage += getExceptionDescWithoutStatus(ex);
}
IE_SUPPRESS_DEPRECATED_END
}
}
if (newNode == nullptr) {
try {
std::unique_ptr<MKLDNNNode> ol(new MKLDNNReferenceNode(op, eng, w_cache, errorMessage));
if (ol != nullptr && ol->created(extMgr))
newNode = ol.release();
} catch (const InferenceEngine::Exception& ex) {
IE_SUPPRESS_DEPRECATED_START
if (ex.getStatus() != NOT_IMPLEMENTED) {
throw;
} else {
errorMessage += getExceptionDescWithoutStatus(ex);
}
IE_SUPPRESS_DEPRECATED_END
}
}
// WA-start : TI node requires all attributes to construct internal subgraph
@ -1206,8 +1239,75 @@ MKLDNNNode* MKLDNNNode::NodesFactory::create(const InferenceEngine::CNNLayerPtr&
ti->setExtManager(extMgr);
// WA-end
if (!newNode)
IE_THROW() << "Unsupported primitive of type: " << layer->type << " name: " << layer->name;
if (!newNode) {
std::string errorDetails;
if (!errorMessage.empty()) {
errorDetails = "\nDetails: \n" + errorMessage;
}
IE_THROW() << "Unsupported operation of type: " << op->get_type_name() << " name: " << op->get_friendly_name() << errorDetails;
}
return newNode;
}
bool MKLDNNNode::canBePerformedAsScaleShift(const MKLDNNNode *parentNode) const {
size_t fusingPort = 0;
for (size_t i = (parentNode == nullptr ? 1 : 0); i < getParentEdges().size(); i++) {
MKLDNNNode *node = getParentEdgeAt(i)->getParent().get();
if (node == nullptr) {
IE_THROW() << "Cannot get parent node for " << getName() << " on " << i << " port";
}
if (node == parentNode) {
fusingPort = i;
continue;
}
if (!node->isConstant() || node->getType() != Input) {
return false;
}
}
const auto isBroadcastableToDataInput = [&]() {
const auto dataShape = getParentEdgeAt(fusingPort)->getDims().ToSizeVector();
for (size_t i = 0; i < getParentEdges().size(); i++) {
if (i == fusingPort)
continue;
auto weightShape = getParentEdgeAt(i)->getDims().ToSizeVector();
// [NM] TODO: PRelu is not broadcastable
// WA: [1,32,46,46], [32] -> [1,32,46,46], [1, 32, 1, 1]
if (getAlgorithm() == EltwisePrelu && weightShape.size() == 1 && weightShape.back() != 1) {
auto newWeightShape = std::vector<size_t>(dataShape.size(), 1);
newWeightShape[1] = weightShape[0];
weightShape = newWeightShape;
}
if (!isPerTensorOrPerChannelBroadcastable(dataShape, weightShape))
return false;
}
return true;
};
const auto isConvertablePowerStatic = [&]() {
if (getAlgorithm() == EltwisePowerStatic) {
const auto eltwise = dynamic_cast<const MKLDNNEltwiseNode *>(this);
if (!eltwise) {
IE_THROW() << "Cannot cast " << getName() << " to MKLDNNEltwiseNode";
}
return eltwise->getAlpha() == 1.0f;
}
return false;
};
return (one_of(getAlgorithm(), EltwiseAdd, EltwiseMultiply, EltwiseSubtract, EltwiseDivide, EltwisePrelu, EltwiseMulAdd) && isBroadcastableToDataInput())
|| isConvertablePowerStatic();
}
bool MKLDNNNode::canFuseSimpleOperation(const MKLDNNNodePtr& node) const {
if (node->getType() == FakeQuantize) {
return node->getAlgorithm() != FQBinarization;
} else if (node->getType() == Eltwise) {
return one_of(node->getAlgorithm(), EltwiseRelu, EltwiseGelu, EltwiseElu, EltwiseSigmoid, EltwiseClamp, EltwiseTanh,
EltwiseSwish, EltwiseHswish, EltwiseMish, EltwiseHsigmoid, EltwiseRoundHalfToEven,
EltwiseRoundHalfAwayFromZero, EltwiseAbs, EltwiseSqrt, EltwiseSoftRelu) ||
node->canBePerformedAsScaleShift(this);
}
return false;
}
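isPerTensorOrPerChannelBroadcastable comes from utils/cpu_utils.hpp and is not shown in this diff; a hedged sketch of the rule it is assumed to implement (the weights must collapse to a scalar or to the channel axis), consistent with the PRelu reshape WA above:
#include <algorithm>
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// Sketch: true if weightDims broadcast per-tensor (single element) or per-channel
// (all elements on axis 1) against dataDims.
bool isPerTensorOrPerChannelBroadcastableSketch(const std::vector<size_t> &dataDims,
                                                const std::vector<size_t> &weightDims) {
    if (weightDims.size() > dataDims.size())
        return false;
    const size_t totalElements = std::accumulate(weightDims.begin(), weightDims.end(),
                                                 static_cast<size_t>(1), std::multiplies<size_t>());
    if (totalElements == 1)
        return true;                                   // per-tensor (scalar) case
    // per-channel case: right-align the weights and require all data on axis 1
    std::vector<size_t> aligned(dataDims.size(), 1);
    std::copy(weightDims.rbegin(), weightDims.rend(), aligned.rbegin());
    for (size_t i = 0; i < aligned.size(); i++) {
        if (i == 1)
            continue;
        if (aligned[i] != 1)
            return false;
    }
    return dataDims.size() > 1 && aligned[1] == dataDims[1];
}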

View File

@ -11,7 +11,6 @@
#include <cassert>
#include <algorithm>
#include <caseless.hpp>
#include <ie_common.h>
#include "mkldnn_dims.h"
#include "mkldnn_memory.h"
#include "mkldnn_edge.h"
@ -23,13 +22,19 @@
#include "mkldnn_weights_cache.hpp"
#include "mkldnn.hpp"
#include <openvino/itt.hpp>
#include "utils/ngraph_utils.hpp"
#include <ngraph/ops.hpp>
#include <ngraph/node.hpp>
#include <ie_precision.hpp>
#include <nodes/common/tensor_desc_creator.h>
#include "cpu_types.h"
namespace MKLDNNPlugin {
using MKLDNNNodePtr = std::shared_ptr<MKLDNNNode>;
using MKLDNNNodeWeakPtr = std::weak_ptr<MKLDNNNode>;
// TODO [NM]: move into separate header
enum Type {
Unknown,
Generic,
@ -38,57 +43,54 @@ enum Type {
Output,
Convolution,
Deconvolution,
Activation,
Depthwise,
Lrn,
Pooling,
FullyConnected,
SoftMax,
Softmax,
Split,
Concatenation,
Eltwise,
Gemm,
MatMul,
Reshape,
Tile,
SimplerNMS,
ROIAlign,
ROIPooling,
BatchNormalization,
PSROIPooling,
BatchToSpace,
DepthToSpace,
Flatten,
Pad,
Permute,
Transpose,
SpaceToBatch,
SpaceToDepth,
StridedSlice,
Copy,
MemoryOutput,
MemoryInput,
RNNCell,
RNNSeq,
Quantize,
FakeQuantize,
BinaryConvolution,
DeformableConvolution,
TensorIterator,
Convert,
MVN,
Normalize,
NormalizeL2,
ScatterUpdate,
ScatterElementsUpdate,
ScatterNDUpdate,
Interpolate,
ReduceAnd,
ReduceL1,
ReduceL2,
ReduceLogSum,
ReduceLogSumExp,
ReduceMax,
ReduceMean,
ReduceMin,
ReduceOr,
ReduceProd,
ReduceSum,
ReduceSumSquare,
Roll
Reduce,
Broadcast,
EmbeddingSegmentsSum,
EmbeddingBagPackedSum,
EmbeddingBagOffsetsSum,
Gather,
GatherElements,
GatherND,
OneHot,
RegionYolo,
Select,
Roll,
Reference,
};
Type TypeFromName(const std::string type);
@ -107,50 +109,44 @@ static std::string NameFromType(Type type) {
return "Convolution";
case Deconvolution:
return "Deconvolution";
case Activation:
return "Activation";
case Lrn:
return "Lrn";
case Pooling:
return "Pooling";
case FullyConnected:
return "FullyConnected";
case Gemm:
return "Gemm";
case SoftMax:
return "SoftMax";
case MatMul:
return "MatMul";
case Softmax:
return "Softmax";
case Split:
return "Split";
case Concatenation:
return "Concatenation";
case Depthwise:
return "Depthwise";
case StridedSlice:
return "StridedSlice";
case Reshape:
return "Reshape";
case Tile:
return "Tile";
case SimplerNMS:
return "SimplerNMS";
case ROIAlign:
return "ROIAlign";
case ROIPooling:
return "ROIPooling";
case BatchNormalization:
return "BatchNormalization";
case PSROIPooling:
return "PSROIPooling";
case DepthToSpace:
return "DepthToSpace";
case Flatten:
return "Flatten";
case BatchToSpace:
return "BatchToSpace";
case Pad:
return "Pad";
case Permute:
return "Permute";
case Transpose:
return "Transpose";
case SpaceToDepth:
return "SpaceToDepth";
case StridedSlice:
return "StridedSlice";
case Copy:
return "Copy";
case SpaceToBatch:
return "SpaceToBatch";
case MemoryOutput:
return "MemoryOutput";
case MemoryInput:
@ -161,8 +157,8 @@ static std::string NameFromType(Type type) {
return "RNNCell";
case Eltwise:
return "Eltwise";
case Quantize:
return "Quantize";
case FakeQuantize:
return "FakeQuantize";
case BinaryConvolution:
return "BinaryConvolution";
case DeformableConvolution:
@ -173,8 +169,8 @@ static std::string NameFromType(Type type) {
return "TensorIterator";
case Convert:
return "Convert";
case Normalize:
return "Normalize";
case NormalizeL2:
return "NormalizeL2";
case ScatterUpdate:
return "ScatterUpdate";
case ScatterElementsUpdate:
@ -183,30 +179,28 @@ static std::string NameFromType(Type type) {
return "ScatterNDUpdate";
case Interpolate:
return "Interpolate";
case ReduceAnd:
return "ReduceAnd";
case ReduceL1:
return "ReduceL1";
case ReduceL2:
return "ReduceL2";
case ReduceLogSum:
return "ReduceLogSum";
case ReduceLogSumExp:
return "ReduceLogSumExp";
case ReduceMax:
return "ReduceMax";
case ReduceMean:
return "ReduceMean";
case ReduceMin:
return "ReduceMin";
case ReduceOr:
return "ReduceOr";
case ReduceProd:
return "ReduceProd";
case ReduceSum:
return "ReduceSum";
case ReduceSumSquare:
return "ReduceSumSquare";
case Reduce:
return "Reduce";
case Broadcast:
return "Broadcast";
case EmbeddingSegmentsSum:
return "EmbeddingSegmentsSum";
case EmbeddingBagPackedSum:
return "EmbeddingBagPackedSum";
case EmbeddingBagOffsetsSum:
return "EmbeddingBagPackedSum";
case Gather:
return "Gather";
case GatherElements:
return "GatherElements";
case GatherND:
return "GatherND";
case OneHot:
return "OneHot";
case RegionYolo:
return "RegionYolo";
case Select:
return "Select";
case Roll:
return "Roll";
default:
@ -269,6 +263,31 @@ private:
std::vector<mkldnn::memory::format_tag> outputLayouts;
};
class DataConfigurator {
public:
DataConfigurator(MKLDNNPlugin::TensorDescCreatorTypes tensorDescType, InferenceEngine::Precision prc, const InferenceEngine::SizeVector& shape,
bool constant = false, int inplace = -1) :
tensorDescCreator(getTensorDescCreator(tensorDescType)), prc(prc), shape(shape), constant(constant), inplace(inplace) {}
DataConfigurator(MKLDNNPlugin::TensorDescCreatorTypes tensorDescType, InferenceEngine::Precision prc = InferenceEngine::Precision::UNSPECIFIED,
bool constant = false, int inplace = -1) :
tensorDescCreator(getTensorDescCreator(tensorDescType)), prc(prc), shape({}), constant(constant), inplace(inplace) {}
const MKLDNNPlugin::TensorDescCreator::CreatorConstPtr tensorDescCreator;
const InferenceEngine::Precision prc = InferenceEngine::Precision::UNSPECIFIED;
const InferenceEngine::SizeVector shape;
const bool constant = false;
const int inplace = -1;
private:
static MKLDNNPlugin::TensorDescCreator::CreatorConstPtr getTensorDescCreator(MKLDNNPlugin::TensorDescCreatorTypes tensorDescType) {
auto& creators = MKLDNNPlugin::TensorDescCreator::getCommonCreators();
if (creators.find(tensorDescType) == creators.end()) {
IE_THROW() << "Cannot find tensor descriptor creator";
}
return creators.at(tensorDescType);
}
};
class MKLDNNNode : public InferenceEngine::details::no_copy {
public:
template<typename T, int N>
@ -343,8 +362,35 @@ public:
bool isFusedWith(Type type) const;
void fuseWith(const MKLDNNNodePtr &fuse) {
fusedWith.push_back(fuse);
void addFusedNode(const MKLDNNNodePtr &fusingNode) {
fusedWith.push_back(fusingNode);
}
virtual void fuseInto(MKLDNNNodePtr& parentNode) {
// The graph supports fusing only consecutive nodes, and some graph logic needs to know through which input port a node was fused into its parent.
for (int i = 0; i < getParentEdges().size(); i++) {
if (getParentEdgesAtPort(i)[0]->getParent().get() == parentNode.get()) {
setFusingPort(i);
break;
}
}
auto parentFusedNodes = parentNode->getFusedWith();
if (getFusingPort() < 0 && !parentFusedNodes.empty()) {
for (int i = 0; i < getParentEdges().size(); i++) {
if (getParentEdgesAtPort(i)[0]->getParent().get() == parentFusedNodes[parentFusedNodes.size() - 1].get()) {
setFusingPort(i);
break;
}
}
}
if (getFusingPort() == -1) {
IE_THROW() << "Cannot determine fusing port between nodes: " << parentNode->getName() << " and " << getName();
}
parentNode->addFusedNode(getParentEdgesAtPort(getFusingPort())[0]->getChild());
parentNode->addOriginalLayer(getOriginalLayers());
}
void clearFusedWith() {
@ -355,8 +401,6 @@ public:
mergedWith.push_back(merge);
}
void addOriginalLayer(const InferenceEngine::CNNLayerPtr &layer);
const std::vector <MKLDNNNodePtr> &getMergeWith() {
return mergedWith;
}
@ -365,10 +409,20 @@ public:
return fusedWith;
}
int getFusingPort() const {
return fusingPort;
}
void setFusingPort(int fusingPort) {
this->fusingPort = fusingPort;
}
const std::string getName() const {
return name;
}
void addOriginalLayer(const std::string& layerName);
const std::string getOriginalLayers() const {
return originalLayers;
}
@ -377,10 +431,6 @@ public:
return type;
}
const InferenceEngine::CNNLayerPtr &getCnnLayer() const {
return cnnLayer;
}
const std::vector<PrimitiveDescInfo>& getSupportedPrimitiveDescriptors() const {
return supportedPrimitiveDescriptors;
}
@ -493,15 +543,6 @@ public:
IE_THROW() << "Primitive descriptor was not found for node " << getName() << ".";
}
static void invertVectorCopyUtoI(const InferenceEngine::PropertyVector<unsigned int>& src, std::vector<ptrdiff_t>& dst) {
dst.clear();
for (int i = 1; i <= src.size(); i++) {
dst.push_back(static_cast<ptrdiff_t>(src[src.size() - i]));
}
}
std::vector<MKLDNNDims> inDims;
int getExecIndex() const {
return execIndex;
}
@ -510,6 +551,10 @@ public:
return typeStr;
}
void setTypeStr(const std::string &typeStr) {
this->typeStr = typeStr;
}
virtual size_t descInputNumbers(MKLDNNDescriptor desc) {
return desc.inputNumbers();
}
@ -532,9 +577,72 @@ public:
*/
virtual InferenceEngine::Precision getRuntimePrecision() const;
const std::vector<InferenceEngine::Precision>& getOriginalInputPrecisions() const {
return originalInputPrecisions;
}
const std::vector<InferenceEngine::Precision>& getOriginalOutputPrecisions() const {
return originalOutputPrecisions;
}
InferenceEngine::Precision getOriginalInputPrecisionAtPort(size_t port) const {
if (originalInputPrecisions.size() <= port) {
IE_THROW() << "Incorrect input port number for node " << getName();
}
return originalInputPrecisions[port];
}
InferenceEngine::Precision getOriginalOutputPrecisionAtPort(size_t port) const {
if (originalOutputPrecisions.size() <= port) {
IE_THROW() << "Incorrect output port number for node " << getName();
}
return originalOutputPrecisions[port];
}
void setOriginalInputPrecisionAtPort(size_t port, InferenceEngine::Precision precision) {
if (originalInputPrecisions.size() <= port) {
IE_THROW() << "Incorrect input port number for node " << getName();
}
originalInputPrecisions[port] = precision;
}
void setOriginalOutputPrecisionAtPort(size_t port, InferenceEngine::Precision precision) {
if (originalOutputPrecisions.size() <= port) {
IE_THROW() << "Incorrect output port number for node " << getName();
}
originalOutputPrecisions[port] = precision;
}
void addOriginalInputPrecision(InferenceEngine::Precision precision) {
originalInputPrecisions.push_back(precision);
}
void addOriginalOutputPrecision(InferenceEngine::Precision precision) {
originalOutputPrecisions.push_back(precision);
}
size_t getOriginalInputsNumber() const {
return originalInputPrecisions.size();
}
size_t getOriginalOutputsNumber() const {
return originalOutputPrecisions.size();
}
Algorithm getAlgorithm() const {
return algorithm;
}
void setAlgorithm(Algorithm alg) {
algorithm = alg;
}
virtual bool canFuse(const MKLDNNNodePtr& node) const {
return false;
}
protected:
// TODO: It is necessary only in order to avoid modifications of cnnLayers and original topology
std::vector<MKLDNNDims> outDims;
bool canBePerformedAsScaleShift(const MKLDNNNode *parentNode = nullptr) const;
bool canFuseSimpleOperation(const MKLDNNNodePtr& node) const;
void setType(Type type) {
this->type = type;
}
@ -559,6 +667,9 @@ protected:
GetPrimitiveMemoryFormatFunc;
std::vector<GetPrimitiveMemoryFormatFunc> internalBlobDesc;
std::vector<MKLDNNDims> inDims;
std::vector<MKLDNNDims> outDims;
std::vector <MKLDNNNodePtr> fusedWith;
std::vector <MKLDNNNodePtr> mergedWith;
std::vector <impl_desc_type> implPriorities;
@ -567,7 +678,8 @@ protected:
std::string originalLayers; // contains names of the original layers separated by comma
MKLDNNNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &w_cache);
MKLDNNNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &w_cache);
MKLDNNNode(const std::string& type, const std::string& name, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &w_cache);
int selectedPrimitiveDescriptorIndex = -1;
bool permanent = false;
@ -589,6 +701,8 @@ protected:
InferenceEngine::Blob::Ptr ext_scales;
MKLDNNWeightsSharing::Ptr weightCache;
Algorithm algorithm = Algorithm::Undefined;
friend class MKLDNNEdge;
friend class MKLDNNGraph;
friend class MKLDNNGraphOptimizer;
@ -604,8 +718,6 @@ protected:
virtual std::vector<mkldnn::memory::format_tag> getAvailableFormatsForDims(const MKLDNNDims& dims) const;
int batchToProcess();
InferenceEngine::Blob::Ptr createInternalBlob(InferenceEngine::SizeVector dims, bool weights, bool is_grouped = false);
InferenceEngine::Layout getWeightsLayoutByDims(InferenceEngine::SizeVector dims, bool isGrouped);
/**
@ -620,15 +732,62 @@ protected:
*/
virtual std::vector<InferenceEngine::Precision> getOutputPrecisions() const;
void addSupportedPrimDesc(const std::vector<DataConfigurator>& inDataConfigurators,
const std::vector<DataConfigurator>& outDataConfigurators,
impl_desc_type implType,
bool dynBatchSupport = false) {
auto fill_port = [] (const DataConfigurator& dataConfigurator, const InferenceEngine::SizeVector& dims,
InferenceEngine::Precision prc, std::vector<InferenceEngine::DataConfig>& port) -> bool {
// To simplify node initialization logic we simply don't add a config in case the target shape is not supported by the tensorDescCreator.
// This should be suitable for the majority of scenarios since almost all nodes add the `ncsp` tensorDescCreator which supports any shape rank.
if (dims.size() < dataConfigurator.tensorDescCreator->getMinimalRank())
return false;
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = dataConfigurator.inplace;
dataConfig.constant = dataConfigurator.constant;
dataConfig.desc = dataConfigurator.tensorDescCreator->createDesc(prc, dims);
port.push_back(dataConfig);
return true;
};
InferenceEngine::LayerConfig config;
for (size_t i = 0; i < inDataConfigurators.size(); i++) {
auto dims = inDataConfigurators[i].shape.empty() ? getParentEdgesAtPort(i)[0]->getDims().ToSizeVector() : inDataConfigurators[i].shape;
auto prc = inDataConfigurators[i].prc == InferenceEngine::Precision::UNSPECIFIED ? getOriginalInputPrecisionAtPort(i)
: inDataConfigurators[i].prc;
if (!fill_port(inDataConfigurators[i], dims, prc, config.inConfs))
return;
}
for (size_t i = 0; i < outDataConfigurators.size(); i++) {
auto dims = outDataConfigurators[i].shape.empty() ? getChildEdgesAtPort(i)[0]->getDims().ToSizeVector() : outDataConfigurators[i].shape;
auto prc = outDataConfigurators[i].prc == InferenceEngine::Precision::UNSPECIFIED ? getOriginalOutputPrecisionAtPort(i)
: outDataConfigurators[i].prc;
if (!fill_port(outDataConfigurators[i], dims, prc, config.outConfs))
return;
}
config.dynBatchSupport = dynBatchSupport;
supportedPrimitiveDescriptors.push_back({config, implType});
}
private:
std::vector<MKLDNNEdgeWeakPtr> parentEdges;
std::vector<MKLDNNEdgeWeakPtr> childEdges;
InferenceEngine::CNNLayerPtr cnnLayer;
std::vector<InferenceEngine::Precision> originalInputPrecisions;
std::vector<InferenceEngine::Precision> originalOutputPrecisions;
int fusingPort;
mkldnn::engine engine;
std::string name;
const std::string typeStr;
std::string typeStr;
Type type;
int execIndex = -1;
@ -660,21 +819,21 @@ private:
};
class MKLDNNNode::NodesFactory : public openvino::cc::Factory<Type,
MKLDNNNode*(const InferenceEngine::CNNLayerPtr&,
MKLDNNNode*(const std::shared_ptr<ngraph::Node>& op,
const mkldnn::engine &,
MKLDNNWeightsSharing::Ptr &)> {
public:
NodesFactory()
: Factory("NodesFactory") {}
MKLDNNNode* create(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng,
MKLDNNNode* create(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng,
const MKLDNNExtensionManager::Ptr& extMgr, MKLDNNWeightsSharing::Ptr &w_cache);
};
template<typename MKLDNNNodeType>
struct MKLDNNNodeImpl : public MKLDNNNodeType {
MKLDNNNodeImpl(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNodeType(layer, eng, cache) {
MKLDNNNodeImpl(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNodeType(op, eng, cache) {
MKLDNNNodeType::perfCounters().template buildClassCounters<MKLDNNNodeType>(NameFromType(MKLDNNNodeType::getType()));
}
};
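For illustration, a hedged usage sketch of the new addSupportedPrimDesc / DataConfigurator helpers; the node class and its initSupportedPrimitiveDescriptors body below are hypothetical, only the helper API itself comes from this diff:
// Hypothetical node: one plain (ncsp) FP32 input and output, reference implementation.
void MKLDNNFooNode::initSupportedPrimitiveDescriptors() {
    if (!supportedPrimitiveDescriptors.empty())
        return;
    addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, InferenceEngine::Precision::FP32}},
                         {{TensorDescCreatorTypes::ncsp, InferenceEngine::Precision::FP32}},
                         impl_desc_type::ref_any,
                         /*dynBatchSupport=*/false);
}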

View File

@ -8,7 +8,6 @@
#include "mkldnn_weights_cache.hpp"
#include "mkldnn_itt.h"
#include <legacy/net_pass.h>
#include <threading/ie_executor_manager.hpp>
#include <memory>
#include <ie_plugin_config.hpp>
@ -16,19 +15,8 @@
#include <tuple>
#include <ie_system_conf.h>
#include <nodes/list.hpp>
#include <legacy/ie_util_internal.hpp>
#include <legacy/graph_transformer.h>
#include <ie_ngraph_utils.hpp>
#include <legacy/convert_function_to_cnn_network.hpp>
#include <legacy/transformations/convert_opset1_to_legacy/convert_opset1_to_legacy.hpp>
#include <legacy/transformations/convert_opset1_to_legacy/convert_prior_to_ie_prior.hpp>
#include <legacy/transformations/convert_opset1_to_legacy/reshape_fully_connected.hpp>
#include <legacy/transformations/convert_opset1_to_legacy/convert_nms_5_to_legacy.hpp>
#include <legacy/transformations/convert_opset1_to_legacy/convert_interpolate_to_interp_or_resample.hpp>
#include <legacy/transformations/convert_opset1_to_legacy/convert_strided_slice_to_crop.hpp>
#include <legacy/ngraph_ops/fully_connected.hpp>
#include <transformations/opset_conversions/convert_opset3_to_opset2.hpp>
#include <transformations/opset_conversions/convert_opset2_to_opset1.hpp>
@ -62,6 +50,8 @@
#include <transformations/op_conversions/log_softmax_decomposition.hpp>
#include <transformations/op_conversions/convert_interpolate1_to_interpolate4.hpp>
#include <transformations/op_conversions/simplify_ctc_greedy_decoder_seq_len.hpp>
#include <transformations/op_conversions/convert_previous_nms_to_nms_5.hpp>
#include <transformations/op_conversions/convert_nms_to_nms_ie_internal.hpp>
#include <transformations/convert_precision.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/rt_info/fused_names_attribute.hpp>
@ -86,8 +76,13 @@
#include <low_precision/multiply_to_group_convolution.hpp>
#include <low_precision/network_helper.hpp>
#include <ie_algorithm.hpp>
#include <ngraph/pass/visualize_tree.hpp>
#include "nodes/mkldnn_mvn_node.h"
#include "nodes/mkldnn_quantize_node.h"
#include "nodes/mkldnn_fake_quantize_node.h"
#include "ngraph_transformations/convert_to_cpu_specific_opset.hpp"
#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64)
# ifdef _WIN32
@ -127,8 +122,6 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
}
// WA: ConvertPriorBox must be executed before the 1st ConstantFolding pass
manager.register_pass<ngraph::pass::ConvertPriorBox>();
manager.register_pass<ngraph::pass::ConvertNMS5ToLegacyMatcher>();
manager.register_pass<ngraph::pass::CommonOptimizations>();
manager.register_pass<ngraph::pass::ConvertRNNSequenceToTensorIterator>();
manager.register_pass<ngraph::pass::ConvertGRUSequenceToTensorIterator>();
@ -141,6 +134,11 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
manager.register_pass<ngraph::pass::LSTMCellDecomposition>();
manager.register_pass<ngraph::pass::GRUCellDecomposition>();
manager.register_pass<ngraph::pass::RNNCellDecomposition>();
manager.register_pass<ngraph::pass::ConvertNMS1ToNMS5>();
manager.register_pass<ngraph::pass::ConvertNMS3ToNMS5>();
manager.register_pass<ngraph::pass::ConvertNMS4ToNMS5>();
manager.register_pass<ngraph::pass::ConvertNMSToNMSIEInternal>();
manager.register_pass<ngraph::pass::ConstantFolding>();
std::vector<std::pair<ngraph::element::Type, ngraph::element::Type>> convert_precision_list{
{ngraph::element::i64, ngraph::element::i32},
@ -155,6 +153,10 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
{ngraph::element::u4, ngraph::element::u8},
};
// In case BF16 is not supported by the target CPU we explicitly convert it to FP32
if (!with_cpu_x86_avx512_core())
convert_precision_list.push_back({ngraph::element::bf16, ngraph::element::f32});
for (auto &precision : convert_precision_list) {
manager.register_pass<ngraph::pass::ConvertPrecision>(precision.first, precision.second);
}
@ -171,12 +173,6 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
node->input_value(0).get_shape().size() == node->get_output_shape(0).size();
});
// Disable FC reshaping for 3D case
pass_config->set_callback<ngraph::pass::ReshapeFullyConnected>(
[](const_node_ptr &node) -> bool {
return node->input_value(0).get_shape().size() == 3ul;
});
pass_config->set_callback<ngraph::pass::ConvertBatchToSpace,
ngraph::pass::ConvertSpaceToBatch>(
[](const_node_ptr &node) -> bool {
@ -260,7 +256,8 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
pass_config->set_callback<ngraph::pass::MVN6Decomposition>(
[](const_node_ptr &node) -> bool {
return MKLDNNMVNNode::checkAxesSuitability(node);
std::string errorMessage;
return MKLDNNMVNNode::isSupportedOperation(node, errorMessage);
});
pass_config->set_callback<ngraph::pass::SoftmaxFusion>(
@ -279,7 +276,6 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
pass_config->disable<ngraph::pass::HSigmoidDecomposition>();
pass_config->disable<ngraph::pass::ConvertMod>();
pass_config->disable<ngraph::pass::LogSoftmaxDecomposition>();
pass_config->disable<ngraph::pass::ConvertInterpolateToInterpOrResampleMatcher>();
pass_config->disable<ngraph::pass::WeightsDequantizeToFakeQuantize>();
pass_config->disable<ngraph::pass::SimplifyCTCGreedyDecoderSeqLen>();
@ -325,57 +321,35 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
transformer.transform(nGraphFunc);
}
bool has_fake_quantize = ::ngraph::op::util::has_op_with_type<ngraph::op::FakeQuantize>(nGraphFunc);
ngraph::pass::Manager postLPTPassManager;
postLPTPassManager.register_pass<ngraph::pass::FakeQuantizeDecomposition>();
postLPTPassManager.register_pass<ngraph::pass::UnrollTensorIterator>();
ngraph::pass::Manager legacyManager;
legacyManager.register_pass<ngraph::pass::FakeQuantizeDecomposition>();
legacyManager.register_pass<ngraph::pass::ConvertOpSet1ToLegacy>();
legacyManager.register_pass<ngraph::pass::ConvertPrecision>(ngraph::element::i64, ngraph::element::i32);
// not legacy actually, but it should be the last transformation in the transformation pipeline
legacyManager.register_pass<ngraph::pass::UnrollTensorIterator>();
auto legacyPassConfig = legacyManager.get_pass_config();
legacyPassConfig->disable<ngraph::pass::ConvertStridedSliceToCropMatcher>();
legacyPassConfig->set_callback<ngraph::pass::FakeQuantizeDecomposition>([](const_node_ptr &node) -> bool {
return !MKLDNNQuantizeNode::isNeedToDecompose(node);
postLPTPassManager.get_pass_config()->set_callback<ngraph::pass::FakeQuantizeDecomposition>([](const_node_ptr &node) -> bool {
std::string errMsg;
return MKLDNNFakeQuantizeNode::isSupportedOperation(node, errMsg);
});
legacyPassConfig->set_callback<ngraph::pass::AddMultiplyFusion>([](const_node_ptr &node) -> bool {
postLPTPassManager.get_pass_config()->set_callback<ngraph::pass::AddMultiplyFusion>([](const_node_ptr &node) -> bool {
if (auto mul_op = std::dynamic_pointer_cast<const ngraph::opset1::Multiply>(node)) {
auto add_op = std::dynamic_pointer_cast<const ngraph::opset1::Add>(mul_op->get_input_node_shared_ptr(0));
auto constant = std::dynamic_pointer_cast<const ngraph::opset1::Constant>(mul_op->get_input_node_shared_ptr(1));
bool is_dequantization = mul_op->get_rt_info().count("DEQUANTIZATION") != 0;
if (add_op && constant && is_dequantization) {
return ngraph::is_type<ngraph::opset1::Convolution>(add_op->get_input_node_shared_ptr(0)) ||
ngraph::is_type<ngraph::opset1::GroupConvolution>(add_op->get_input_node_shared_ptr(0)) ||
ngraph::is_type<ngraph::opset1::MatMul>(add_op->get_input_node_shared_ptr(0));
ngraph::is_type<ngraph::opset1::GroupConvolution>(add_op->get_input_node_shared_ptr(0)) ||
ngraph::is_type<ngraph::opset1::MatMul>(add_op->get_input_node_shared_ptr(0));
}
}
return false;
});
legacyPassConfig->set_callback<ngraph::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
postLPTPassManager.get_pass_config()->set_callback<ngraph::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
// The UnrollTI transformation is disabled by default and is turned on by the LowLatency transformation
return node->get_rt_info().count("UNROLL_TI") == 0;
});
legacyManager.run_passes(nGraphFunc);
postLPTPassManager.run_passes(nGraphFunc);
OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, MKLDNNPlugin::itt::domains::MKLDNN_LT, "Transformation", "convertFunctionToICNNNetwork");
clonedNetwork = CNNNetwork(InferenceEngine::details::convertFunctionToICNNNetwork(nGraphFunc, clonedNetwork, has_fake_quantize));
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "ConvertIOPrecision");
// WA: after conversion to CNNNetwork user precision can redefine input/output precisions
// so we need to apply additional precision conversion but only for inputs and outputs
for (auto & precision : convert_precision_list) {
NetPass::ConvertIOPrecision(clonedNetwork,
InferenceEngine::details::convertPrecision(precision.first),
InferenceEngine::details::convertPrecision(precision.second));
}
ConvertToCPUSpecificOpset(nGraphFunc);
}
InferenceEngine::ExecutableNetworkInternal::Ptr
@ -411,34 +385,9 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
conf.batchLimit = static_cast<int>(network.getBatchSize());
}
CNNNetwork clonedNetwork = InferenceEngine::cloneNetwork(network);
CNNNetwork clonedNetwork = InferenceEngine::details::cloneNetwork(network);
bool is_transformed = false;
if (clonedNetwork.getFunction()) {
Transformation(clonedNetwork, conf);
is_transformed = true;
}
IE_SUPPRESS_DEPRECATED_START
auto icnnnet = static_cast<ICNNNetwork::Ptr>(clonedNetwork);
IE_SUPPRESS_DEPRECATED_END
auto implNetwork = std::dynamic_pointer_cast<details::CNNNetworkImpl>(icnnnet);
if (implNetwork) {
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::MKLDNN_LT, "CNNNet_based_ConstFolding");
// valid for CNNNetworkImpl only, while there's no API in ICNNNetwork to change network
ConstTransformer transformator(implNetwork.get());
transformator.fullTrim();
if (!is_transformed) {
InferenceEngine::CNNNetwork implNetworkWrapper(implNetwork);
NetPass::ConvertPrecision(implNetworkWrapper, Precision::I64, Precision::I32);
NetPass::ConvertPrecision(implNetworkWrapper, Precision::U64, Precision::I32);
NetPass::ConvertPrecision(implNetworkWrapper, Precision::U32, Precision::I32);
NetPass::ConvertPrecision(implNetworkWrapper, Precision::FP64, Precision::FP32);
NetPass::ConvertPrecision(implNetworkWrapper, Precision::FP16, Precision::FP32);
NetPass::ConvertPrecision(implNetworkWrapper, Precision::BOOL, Precision::U8);
NetPass::ConvertPrecision(implNetworkWrapper, Precision::U16, Precision::I32);
NetPass::ConvertPrecision(implNetworkWrapper, Precision::I16, Precision::I32);
}
}
Transformation(clonedNetwork, conf);
return std::make_shared<MKLDNNExecNetwork>(clonedNetwork, conf, extensionManager, weightsSharing);
}
@ -540,6 +489,7 @@ void Engine::AddExtension(InferenceEngine::IExtensionPtr extension) {
QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::map<std::string, std::string>& config) const {
QueryNetworkResult res;
MKLDNNWeightsSharing::Ptr fake_w_cache;
auto function = network.getFunction();
if (function != nullptr) {
@ -556,21 +506,22 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma
conf.batchLimit = static_cast<int>(network.getBatchSize());
}
auto clonedNetwork = InferenceEngine::cloneNetwork(network);
auto clonedNetwork = InferenceEngine::details::cloneNetwork(network);
auto ops = clonedNetwork.getFunction()->get_ordered_ops();
Transformation(clonedNetwork, conf);
std::unordered_set<std::string> supported;
std::unordered_set<std::string> unsupported;
for (details::CNNNetworkIterator itLayer{clonedNetwork}; itLayer != details::CNNNetworkIterator(); itLayer++) {
for (auto op : ops) {
auto layerIsSupported = [&] {
std::unique_ptr<MKLDNNNode> ptr;
try {
ptr.reset(MKLDNNNode::factory().create(*itLayer, {mkldnn::engine::kind::cpu, 0}, extensionManager, fake_w_cache));
ptr.reset(MKLDNNNode::factory().create(op, {mkldnn::engine::kind::cpu, 0}, extensionManager, fake_w_cache));
} catch (InferenceEngine::Exception&) {
return false;
return false;
}
return true;
} ();
for (auto&& fusedLayerName : ngraph::getFusedNamesVector((*itLayer)->getNode())) {
for (auto&& fusedLayerName : ngraph::getFusedNamesVector(op)) {
if (InferenceEngine::details::contains(originalOps, fusedLayerName)) {
if (layerIsSupported) {
supported.emplace(fusedLayerName);
@ -614,17 +565,7 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma
res.supportedLayersMap.emplace(layerName, GetName());
}
} else {
details::CNNNetworkIterator i(network);
while (i != details::CNNNetworkIterator()) {
try {
mkldnn::engine eng(mkldnn::engine(mkldnn::engine::kind::cpu, 0));
// if we can create and have not thrown exception, then layer is supported
std::unique_ptr <MKLDNNNode>(MKLDNNNode::factory().create(*i, eng, extensionManager, fake_w_cache));
res.supportedLayersMap.insert({ (*i)->name, GetName() });
} catch (InferenceEngine::Exception&) {
}
i++;
}
IE_THROW() << "CPU plug-in does not support models that are not ngraph-based!";
}
return res;

View File

@ -0,0 +1,98 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "convert_broadcast_to_tiles.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::ConvertBroadcastToTiles, "ConvertBroadcastToTiles", 0);
MKLDNNPlugin::ConvertBroadcastToTiles::ConvertBroadcastToTiles() {
auto broadcast = ngraph::pattern::wrap_type<ngraph::opset1::Broadcast>();
ngraph::matcher_pass_callback callback = [this](ngraph::pattern::Matcher& m) {
auto broadcast = std::dynamic_pointer_cast<ngraph::opset1::Broadcast>(m.get_match_root());
if (!broadcast) {
return false;
}
auto data_node = broadcast->input_value(0);
if (data_node.get_partial_shape().is_dynamic()) {
return false;
}
auto shape_node = std::dynamic_pointer_cast<ngraph::opset1::Constant>(broadcast->input_value(1).get_node_shared_ptr());
auto axes_node = std::dynamic_pointer_cast<ngraph::opset1::Constant>(broadcast->input_value(2).get_node_shared_ptr());
if (!shape_node || !axes_node) return false;
auto output_shape = shape_node->cast_vector<int64_t>();
auto input_shape = data_node.get_shape();
int64_t cur_dim_id = output_shape.size() - 1;
size_t dims_count = output_shape.size();
auto last_node = data_node;
ngraph::NodeVector new_ops;
// If input_shape and output_shape differ, insert a Reshape to align the shapes
if (input_shape.size() != dims_count) {
if (input_shape.size() > dims_count) {
return false;
}
ngraph::Shape shape;
auto broadcast_type = broadcast->get_broadcast_spec();
if (broadcast_type == ngraph::op::AutoBroadcastType::NUMPY) {
shape = input_shape;
for (size_t i = 0; i < (dims_count - input_shape.size()); ++i) {
shape.insert(shape.begin(), 1);
}
} else if (broadcast_type == ngraph::op::AutoBroadcastType::NONE) {
auto axes = axes_node->cast_vector<int64_t>();
shape.assign(output_shape.size(), 1);
for (size_t i = 0; i < input_shape.size(); ++i) {
shape[axes[i]] = input_shape[i];
}
} else {
return false;
}
auto shape_const = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i64, ngraph::Shape{shape.size()}, shape);
auto reshape = std::make_shared<ngraph::opset1::Reshape>(data_node, shape_const, true);
new_ops.push_back(reshape);
last_node = reshape;
input_shape = shape;
}
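// For example, broadcasting data of shape {3, 1} to output shape {2, 3, 4} with NUMPY rules
// first reshapes the data to {1, 3, 1} above, and the loop below derives Tile repeats {2, 1, 4},
// so Tile({1, 3, 1}, {2, 1, 4}) produces the requested {2, 3, 4} output.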
std::vector<int64_t> dims(dims_count, 1);
auto input_shape_it = input_shape.rbegin();
auto output_shape_it = output_shape.rbegin();
while (output_shape_it != output_shape.rend() && input_shape_it != input_shape.rend()) {
int64_t in_dim = *input_shape_it, out_dim = *output_shape_it;
if (in_dim != out_dim) {
if (in_dim != 1) {
return false;
}
dims[cur_dim_id] = out_dim;
}
--cur_dim_id;
++output_shape_it;
++input_shape_it;
}
auto const_node = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i64, ngraph::Shape{dims_count}, dims);
auto tile = register_new_node<ngraph::opset1::Tile>(last_node, const_node);
new_ops.push_back(tile);
tile->set_friendly_name(broadcast->get_friendly_name());
ngraph::copy_runtime_info(broadcast, new_ops);
ngraph::replace_node(broadcast, tile);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(broadcast, "ConvertBroadcastToTiles");
this->register_matcher(m, callback);
}

View File

@ -0,0 +1,17 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
namespace MKLDNNPlugin {
class ConvertBroadcastToTiles: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
ConvertBroadcastToTiles();
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,251 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "convert_matmul_to_fc_or_gemm.hpp"
#include "op/fully_connected.hpp"
#include <numeric>
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <transformations/utils/utils.hpp>
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::ConvertMatMulToFC, "ConvertMatMulToFC", 0);
MKLDNNPlugin::ConvertMatMulToFC::ConvertMatMulToFC() {
auto matmul = ngraph::pattern::wrap_type<ngraph::opset1::MatMul>({ngraph::pattern::any_input(ngraph::pattern::has_static_shape()),
ngraph::pattern::any_input(ngraph::pattern::has_static_shape())},
ngraph::pattern::has_static_shape());
ngraph::matcher_pass_callback callback = [this](ngraph::pattern::Matcher& m) {
auto matmul = std::dynamic_pointer_cast<ngraph::opset1::MatMul>(m.get_match_root());
if (!matmul) {
return false;
}
auto input_a = matmul->input(0).get_source_output();
auto input_b = matmul->input(1).get_source_output();
auto shape_a = input_a.get_shape();
auto shape_b = input_b.get_shape();
auto output_shape = matmul->get_shape();
// Transformation to FC is not supported for 1D second input
if (shape_b.size() == 1) {
return false;
}
/*
* The get_aligned_shapes function aligns the two input shapes to the same rank and
* the same batch dimensions (the last two dimensions are not compared).
* It also checks that the batch dimensions are compatible; for example,
* the shapes [2, 32, 64] and [3, 64, 64] will raise an exception.
*/
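// For example, shapes [2, 3, 64] and [64, 64] are aligned to [2, 3, 64] and [2, 64, 64].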
auto get_aligned_shapes = [shape_a, shape_b, &matmul]() -> std::pair<ngraph::Shape, ngraph::Shape> {
ngraph::Shape shape_a_aligned(shape_a), shape_b_aligned(shape_b);
size_t max_size = std::max(shape_a_aligned.size(), shape_b_aligned.size());
for (size_t i = 0, cnt = max_size - shape_a_aligned.size(); i < cnt; ++i)
shape_a_aligned.insert(shape_a_aligned.begin(), 1);
for (size_t i = 0, cnt = max_size - shape_b_aligned.size(); i < cnt; ++i)
shape_b_aligned.insert(shape_b_aligned.begin(), 1);
if (matmul->get_transpose_a() && shape_a.size() != 1) {
std::swap(*(shape_a_aligned.end() - 1), *(shape_a_aligned.end() - 2));
}
if (matmul->get_transpose_b()) {
std::swap(*(shape_b_aligned.end() - 1), *(shape_b_aligned.end() - 2));
}
for (size_t i = 0; i < max_size - 2; ++i) {
if (shape_a_aligned[i] != shape_b_aligned[i] && shape_a_aligned[i] > 1 && shape_b_aligned[i] > 1) {
std::ostringstream stream;
stream << "Shapes can't be aligned: " << shape_a_aligned << " " << shape_b_aligned;
throw ngraph::ngraph_error(stream.str());
}
size_t max_value = std::max(shape_a_aligned[i], shape_b_aligned[i]);
shape_a_aligned[i] = shape_b_aligned[i] = max_value;
}
return {shape_a_aligned, shape_b_aligned};
};
/*
* The create_transpose function returns a Transpose operation that replaces the transpose_a or
* transpose_b attribute with an explicit operation. The transpose order has the same length as
* the output shape of the given node and is filled with an increasing sequence starting from 0,
* with the last two dimensions swapped. For example, for length = 4 the order is [0, 1, 3, 2],
* which emulates the transpose_a or transpose_b attribute.
*/
auto create_transpose = [this](ngraph::Output<ngraph::Node> node, const std::string& transpose_name) -> std::shared_ptr<ngraph::Node> {
ngraph::Shape output_shape = node.get_node_shared_ptr()->get_shape();
std::vector<size_t> transpose_order(output_shape.size());
std::iota(transpose_order.begin(), transpose_order.end(), 0);
std::swap(*(transpose_order.end() - 1), *(transpose_order.end() - 2));
auto transpose = ngraph::pass::MatcherPass::register_new_node<ngraph::opset1::Transpose>(
node, ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{transpose_order.size()}, transpose_order));
transpose->set_friendly_name(transpose_name);
return transpose;
};
// fc_input_a and fc_input_b are the final inputs that will be passed to the FullyConnected or Gemm operations,
// so whenever a new operation is inserted on top of the MatMul inputs, fc_input_a and fc_input_b
// must be kept up to date.
auto fc_input_a = input_a, fc_input_b = input_b;
// vector of new nGraph operations
ngraph::NodeVector new_ops;
// If the second input is a Constant (or FakeQuantize) operation and its shape, ignoring dimensions equal to 1,
// has rank <= 2, MatMul is replaced with a FullyConnected operation.
// Otherwise MatMul is replaced with Gemm.
if ((std::dynamic_pointer_cast<ngraph::opset1::Constant>(fc_input_b.get_node_shared_ptr()) ||
std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(fc_input_b.get_node_shared_ptr())) &&
std::count_if(shape_b.begin(), shape_b.end(), [](size_t x) { return x != 1; }) <= 2) {
ngraph::Shape shape_a_aligned, shape_b_aligned;
std::tie(shape_a_aligned, shape_b_aligned) = get_aligned_shapes();
if (shape_a_aligned.size() < 2 || shape_b_aligned.size() < 2) {
throw ngraph::ngraph_error("MatMul " + matmul->get_friendly_name() + " shapes are inconsistent.");
}
// Transferring from MatMul representation: [B, I, K] * [B, K, O] = [B, I, O]
// to FullyConnected representation: [I, K] * [K, O] = [I, O]
size_t K = *(shape_a_aligned.end() - 1);
ngraph::Shape B(shape_a_aligned.begin(), shape_a_aligned.end() - 2);
// Weights normalization
if (!matmul->get_transpose_b()) {
fc_input_b = create_transpose(fc_input_b, matmul->get_friendly_name() + "/transpose_b");
new_ops.push_back(fc_input_b.get_node_shared_ptr());
}
if (shape_b.size() != 2) {
auto reshape_shape =
ngraph::opset1::Constant::create<int64_t>(ngraph::element::i64, ngraph::Shape{2}, {-1ll, static_cast<int64_t>(K)});
fc_input_b = std::make_shared<ngraph::opset1::Reshape>(fc_input_b, reshape_shape, true);
new_ops.push_back(fc_input_b.get_node_shared_ptr());
}
// Input normalization
if (matmul->get_transpose_a() && shape_a.size() != 1) {
fc_input_a = create_transpose(fc_input_a, matmul->get_friendly_name() + "/transpose_a");
new_ops.push_back(fc_input_a.get_node_shared_ptr());
}
// Create FullyConnected
auto fc = std::make_shared<MKLDNNPlugin::FullyConnectedNode>(fc_input_a, fc_input_b, output_shape, matmul->output(0).get_element_type());
fc->set_friendly_name(matmul->get_friendly_name());
new_ops.push_back(fc);
ngraph::copy_runtime_info(matmul, new_ops);
ngraph::replace_node(matmul, fc);
return true;
}
return false;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(matmul, "ConvertMatMulToFC");
this->register_matcher(m, callback);
}
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::ConvertMatMulToGemm, "ConvertMatMulToGemm", 0);
MKLDNNPlugin::ConvertMatMulToGemm::ConvertMatMulToGemm() {
auto matmul = ngraph::pattern::wrap_type<ngraph::opset1::MatMul>({ngraph::pattern::any_input(ngraph::pattern::has_static_shape()),
ngraph::pattern::any_input(ngraph::pattern::has_static_shape())},
ngraph::pattern::has_static_shape());
ngraph::matcher_pass_callback callback = [this](ngraph::pattern::Matcher& m) {
auto matmul = std::dynamic_pointer_cast<ngraph::opset1::MatMul>(m.get_match_root());
if (!matmul) {
return false;
}
auto input_a = matmul->input(0).get_source_output();
auto input_b = matmul->input(1).get_source_output();
auto shape_a = input_a.get_shape();
auto shape_b = input_b.get_shape();
auto output_shape = matmul->get_shape();
auto fc_input_a = input_a, fc_input_b = input_b;
ngraph::NodeVector new_ops;
if (shape_a.size() == 1) {
// If the first input is a 1D tensor, it is unsqueezed to a 2D tensor (row vector)
// by adding an axis with size 1 at ROW_INDEX_DIM, to the left of the shape.
// For example, {S} is reshaped to {1, S}.
fc_input_a = std::make_shared<ngraph::opset1::Unsqueeze>(fc_input_a,
ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {0}));
shape_a = fc_input_a.get_shape();
new_ops.push_back(fc_input_a.get_node_shared_ptr());
// For 1D inputs transpose flag is expected to always act like `false`
matmul->set_transpose_a(false);
}
if (shape_b.size() == 1) {
// If the second input is a 1D tensor, it is unsqueezed to a 2D tensor (column vector)
// by adding an axis with size 1 at COL_INDEX_DIM, to the right of the shape.
// For example, {S} is reshaped to {S, 1}.
fc_input_b = std::make_shared<ngraph::opset1::Unsqueeze>(fc_input_b,
ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {1}));
shape_b = fc_input_b.get_shape();
new_ops.push_back(fc_input_b.get_node_shared_ptr());
// For 1D inputs transpose flag is expected to always act like `false`
matmul->set_transpose_b(false);
}
// WA for IE: Gemm must have inputs of the same rank.
// If the ranks of the input arguments are still different,
// the smaller tensor is unsqueezed from the left side of the shape
// by the necessary number of axes so that both shapes have the same rank.
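// For example, with shape_a = {5, 3, 4} and shape_b = {4, 6}, the second input is reshaped
// to {1, 4, 6} so that both Gemm inputs have rank 3.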
if (shape_a.size() < shape_b.size()) {
// Reshape first input (fc_input_a)
ngraph::Shape reshape_shape(shape_b.size() - shape_a.size(), 1);
reshape_shape.insert(reshape_shape.end(), shape_a.begin(), shape_a.end());
fc_input_a = ngraph::op::util::reshapeTo(fc_input_a, reshape_shape);
new_ops.push_back(fc_input_a.get_node_shared_ptr());
} else if (shape_b.size() < shape_a.size()) {
// Reshape second input (fc_input_b)
ngraph::Shape reshape_shape(shape_a.size() - shape_b.size(), 1);
reshape_shape.insert(reshape_shape.end(), shape_b.begin(), shape_b.end());
fc_input_b = ngraph::op::util::reshapeTo(fc_input_b, reshape_shape);
new_ops.push_back(fc_input_b.get_node_shared_ptr());
}
auto gemm = matmul->copy_with_new_inputs({ fc_input_a, fc_input_b });
new_ops.push_back(gemm);
if (gemm->get_shape() != output_shape) {
// This case is possible when one of the inputs is 1D (which is not supported by the GEMM operation),
// so an additional Reshape operation is inserted to preserve the output shape.
std::shared_ptr<ngraph::Node> reshape_output;
if (output_shape.size() == 0) {
std::vector<int64_t> dim_indices(gemm->get_shape().size());
std::iota(dim_indices.begin(), dim_indices.end(), 0);
reshape_output = std::make_shared<ngraph::opset1::Squeeze>(gemm,
ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{dim_indices.size()}, dim_indices));
} else {
reshape_output = ngraph::op::util::reshapeTo(gemm, output_shape);
}
new_ops.push_back(reshape_output);
gemm->set_friendly_name(matmul->get_friendly_name() + "/gemm");
reshape_output->set_friendly_name(matmul->get_friendly_name());
ngraph::copy_runtime_info(matmul, new_ops);
ngraph::replace_node(matmul, reshape_output);
} else {
gemm->set_friendly_name(matmul->get_friendly_name());
ngraph::copy_runtime_info(matmul, new_ops);
ngraph::replace_node(matmul, gemm);
}
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(matmul, "ConvertMatMulToGemm");
this->register_matcher(m, callback);
}

View File

@ -0,0 +1,23 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
namespace MKLDNNPlugin {
class ConvertMatMulToFC: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
ConvertMatMulToFC();
};
class ConvertMatMulToGemm: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
ConvertMatMulToGemm();
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,95 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "convert_tile_to_seq_tiles.hpp"
#include <memory>
#include <vector>
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/rt_info.hpp>
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::ConvertTileToSeqTiles, "ConvertTileToSeqTiles", 0);
MKLDNNPlugin::ConvertTileToSeqTiles::ConvertTileToSeqTiles() {
auto tile = ngraph::pattern::wrap_type<ngraph::opset1::Tile>({ngraph::pattern::any_input(ngraph::pattern::has_static_rank()),
ngraph::pattern::wrap_type<ngraph::opset1::Constant>()});
ngraph::matcher_pass_callback callback = [](ngraph::pattern::Matcher& m) {
auto tile = std::dynamic_pointer_cast<ngraph::opset1::Tile>(m.get_match_root());
if (!tile) {
return false;
}
auto tiles_node = std::dynamic_pointer_cast<ngraph::opset1::Constant>(tile->input_value(1).get_node_shared_ptr());
if (!tiles_node) return false;
auto tiles = tiles_node->cast_vector<int64_t>();
auto input_shape_rank = static_cast<size_t>(tile->get_input_partial_shape(0).rank().get_length());
int64_t cur_dim_id = tiles.size() - 1;
if (static_cast<int64_t>(tiles.size()) != input_shape_rank) return false;
auto last_node = tile->input_value(0);
auto friendly_name = tile->get_friendly_name();
int num_of_tile_dims = 0;
for (auto t : tiles) {
if (t != 1) {
num_of_tile_dims++;
}
}
if (num_of_tile_dims == 0) {
auto outputs = tile->get_output_target_inputs(0);
for (const auto &out : outputs) {
if (std::dynamic_pointer_cast<ngraph::opset1::Result>(out.get_node()->shared_from_this())) {
return false;
}
}
ngraph::replace_node(tile, {last_node});
return true;
}
// A sequence of Tile operations is generated when more than one axis is tiled,
// because the IE Tile operation supports tiling along only one axis.
// To keep op names unique, the IE-specific delimiter ':' is used.
// Original frameworks do not use this delimiter in names, so a newly generated
// name like "original_name:_1" is guaranteed not to clash with existing names.
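// For example, repeats {2, 1, 3} on a 3D input produce two Tile ops: the first with repeats
// {1, 1, 3} and the second with repeats {2, 1, 1}; the last Tile keeps the original friendly name.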
if (num_of_tile_dims > 1) {
friendly_name += ":";
}
ngraph::NodeVector new_ops;
auto tiles_it = tiles.rbegin();
while (tiles_it != tiles.rend()) {
int64_t tile_dim = *tiles_it;
if (tile_dim != 1) {
std::vector<int64_t> dims(input_shape_rank, 1);
dims[cur_dim_id] = tile_dim;
auto const_node = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i64, ngraph::Shape{input_shape_rank}, dims);
auto new_tile = std::make_shared<ngraph::opset1::Tile>(last_node, const_node);
new_tile->set_friendly_name(friendly_name);
friendly_name += "_" + std::to_string(cur_dim_id);
new_ops.push_back(new_tile);
last_node = new_tile;
}
--cur_dim_id;
++tiles_it;
}
last_node.get_node_shared_ptr()->set_friendly_name(tile->get_friendly_name());
ngraph::copy_runtime_info(tile, new_ops);
ngraph::replace_node(tile, {last_node});
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(tile, "ConvertTileToSeqTiles");
this->register_matcher(m, callback);
}

View File

@ -0,0 +1,17 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
namespace MKLDNNPlugin {
class ConvertTileToSeqTiles: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
ConvertTileToSeqTiles();
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,49 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <ngraph/pass/constant_folding.hpp>
#include "convert_matmul_to_fc_or_gemm.hpp"
#include "fc_bias_fusion.hpp"
#include "reshape_fc_fusion.hpp"
#include "reshape_fully_connected.hpp"
#include "convert_broadcast_to_tiles.hpp"
#include "convert_tile_to_seq_tiles.hpp"
#include "reshape_1d_ops.hpp"
#include "convert_to_power_static.hpp"
#include "convert_to_leaky_relu.hpp"
#include "convert_to_swish_cpu.hpp"
#include "reshape_prelu.hpp"
#include "rnn_sequences_optimization.hpp"
namespace MKLDNNPlugin {
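// ConvertToCPUSpecificOpset runs after the common and low precision transformations and lowers
// the remaining ngraph operations to CPU plugin specific ones (FullyConnected, PowerStatic,
// LeakyRelu, SwishCPU, etc.).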
inline void ConvertToCPUSpecificOpset(std::shared_ptr<ngraph::Function> &nGraphFunc) {
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::ConstantFolding>();
manager.register_pass<Reshape1DConvolution>();
manager.register_pass<Reshape1DGroupConvolution>();
manager.register_pass<Reshape1DAvgPool>();
manager.register_pass<Reshape1DMaxPool>();
manager.register_pass<ConvertBroadcastToTiles>();
manager.register_pass<ConvertTileToSeqTiles>();
manager.register_pass<ConvertMatMulToFC>();
manager.register_pass<ConvertMatMulToGemm>();
manager.register_pass<FullyConnectedBiasFusion>();
manager.register_pass<ReshapeFullyConnected>();
manager.register_pass<ConvertToPowerStatic>();
manager.register_pass<ConvertToLeakyRelu>();
manager.register_pass<ReshapePRelu>();
manager.register_pass<ConvertToSwishCPU>();
manager.register_pass<OptimizeGRUSequenceTransposes>();
manager.register_pass<OptimizeLSTMSequenceTransposes>();
manager.register_pass<OptimizeRNNSequenceTransposes>();
if (!ngraph::op::util::has_op_with_type<ngraph::op::FakeQuantize>(nGraphFunc)) {
manager.register_pass<ReshapeFullyConnectedFusion>();
}
manager.register_pass<ngraph::pass::ConstantFolding>();
manager.register_pass<ngraph::pass::ConvertPrecision>(ngraph::element::i64, ngraph::element::i32);
manager.run_passes(nGraphFunc);
}
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,38 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "convert_to_leaky_relu.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include "op/leaky_relu.hpp"
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::ConvertToLeakyRelu, "ConvertToLeakyRelu", 0);
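// Replaces a PRelu whose slope input is a single-element Constant with the CPU specific
// LeakyRelu operation; PRelu with a per-channel slope is left untouched.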
MKLDNNPlugin::ConvertToLeakyRelu::ConvertToLeakyRelu() {
auto prelu = ngraph::pattern::wrap_type<ngraph::opset1::PRelu>({ngraph::pattern::any_input(ngraph::pattern::has_static_shape()),
ngraph::pattern::any_input(ngraph::pattern::has_static_shape())});
ngraph::matcher_pass_callback callback = [this](ngraph::pattern::Matcher& m) {
auto prelu = std::dynamic_pointer_cast<ngraph::opset1::PRelu>(m.get_match_root());
if (!prelu) {
return false;
}
auto slopeNode = std::dynamic_pointer_cast<ngraph::opset1::Constant>(prelu->get_input_node_shared_ptr(1));
if (slopeNode != nullptr && ngraph::shape_size(prelu->get_input_shape(1)) == 1) {
const float slope = slopeNode->cast_vector<float>()[0];
const auto leakyRelu = std::make_shared<MKLDNNPlugin::LeakyReluNode>(prelu->input(0).get_source_output(), slope,
prelu->output(0).get_element_type());
leakyRelu->set_friendly_name(prelu->get_friendly_name());
ngraph::copy_runtime_info(prelu, leakyRelu);
ngraph::replace_node(prelu, leakyRelu);
return true;
}
return false;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(prelu, "ConvertToLeakyRelu");
this->register_matcher(m, callback);
}

View File

@ -0,0 +1,17 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
namespace MKLDNNPlugin {
class ConvertToLeakyRelu: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
ConvertToLeakyRelu();
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,131 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "convert_to_power_static.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset4.hpp>
#include <ngraph/opsets/opset6.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/pattern/op/or.hpp>
#include "op/power_static.hpp"
#include "op/fully_connected.hpp"
#include "utils/general_utils.h"
int getConstPort(const std::shared_ptr<ngraph::Node> &node) {
const auto const1 = std::dynamic_pointer_cast<ngraph::opset1::Constant>(node->get_input_node_shared_ptr(0));
const auto const2 = std::dynamic_pointer_cast<ngraph::opset1::Constant>(node->get_input_node_shared_ptr(1));
int constPort = -1;
if (const2) {
constPort = 1;
} else if (const1) {
constPort = 0;
}
return constPort;
}
template <class BaseOp>
bool isConvertableToPowerStatic(const std::shared_ptr<BaseOp> &node) {
const int constPort = getConstPort(node);
if ((!node->get_input_element_type(0).is_real() && !node->get_input_element_type(1).is_real()) || !node->get_output_element_type(0).is_real() ||
constPort == -1) {
return false;
}
const int nonConstPort = 1 - constPort;
const auto constNode = std::dynamic_pointer_cast<ngraph::opset1::Constant>(node->get_input_node_shared_ptr(constPort));
return ngraph::shape_size(node->get_input_shape(constPort)) == 1 &&
node->get_input_shape(nonConstPort).size() >= node->get_input_shape(constPort).size() &&
!MKLDNNPlugin::one_of(node->get_input_node_shared_ptr(nonConstPort)->get_type_info(), ngraph::opset1::NormalizeL2::type_info,
ngraph::opset4::Interpolate::type_info,
ngraph::opset1::Convolution::type_info,
ngraph::opset1::GroupConvolution::type_info,
ngraph::opset1::ConvolutionBackpropData::type_info,
ngraph::opset1::GroupConvolutionBackpropData::type_info,
MKLDNNPlugin::FullyConnectedNode::type_info,
ngraph::op::v0::MVN::type_info,
ngraph::opset6::MVN::type_info);
}
template <>
bool isConvertableToPowerStatic(const std::shared_ptr<ngraph::opset1::Power> &node) {
return std::dynamic_pointer_cast<ngraph::opset1::Constant>(node->get_input_node_shared_ptr(1)) != nullptr &&
node->get_input_shape(0).size() >= node->get_input_shape(1).size() && ngraph::shape_size(node->get_input_shape(1)) == 1;
}
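// Assuming PowerStatic computes (scale * x + shift) ^ power element-wise, the conversions below map:
//   Power(x, c)    -> PowerStatic(power = c, scale = 1, shift = 0)
//   Add(x, c)      -> PowerStatic(power = 1, scale = 1, shift = c)
//   Subtract(x, c) -> PowerStatic(power = 1, scale = 1, shift = -c)  (scale = -1, shift = c when the constant is the first input)
//   Multiply(x, c) -> PowerStatic(power = 1, scale = c, shift = 0)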
template <class BaseOp>
std::shared_ptr<ngraph::Node> convert(const std::shared_ptr<BaseOp> &node) {
const int constPort = getConstPort(node);
const int nonConstPort = 1 - constPort;
std::shared_ptr<ngraph::opset1::Constant> powerNode = std::dynamic_pointer_cast<ngraph::opset1::Constant>(node->get_input_node_shared_ptr(constPort));
const float value = powerNode->cast_vector<float>()[0];
if (std::is_same<BaseOp, ngraph::opset1::Power>::value) {
return std::make_shared<MKLDNNPlugin::PowerStaticNode>(node->input(nonConstPort).get_source_output(), value, 1.0f, 0.0f,
node->output(0).get_element_type());
} else if (std::is_same<BaseOp, ngraph::opset1::Add>::value) {
return std::make_shared<MKLDNNPlugin::PowerStaticNode>(node->input(nonConstPort).get_source_output(), 1.0f, 1.0f, value,
node->output(0).get_element_type());
} else if (std::is_same<BaseOp, ngraph::opset1::Subtract>::value) {
float scale = 1.0f;
float shift = value;
if (constPort == 0) {
scale *= -1.0f;
} else {
shift *= -1.0f;
}
return std::make_shared<MKLDNNPlugin::PowerStaticNode>(node->input(nonConstPort).get_source_output(), 1.0f, scale, shift,
node->output(0).get_element_type());
} else if (std::is_same<BaseOp, ngraph::opset1::Multiply>::value) {
return std::make_shared<MKLDNNPlugin::PowerStaticNode>(node->input(nonConstPort).get_source_output(), 1.f, value, 0.0f,
node->output(0).get_element_type());
} else {
throw ngraph::ngraph_error("ConvertToPowerStatic: op type is not supported");
}
}
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::ConvertToPowerStatic, "ConvertToPowerStatic", 0);
MKLDNNPlugin::ConvertToPowerStatic::ConvertToPowerStatic() {
ngraph::OutputVector twoInputs = {ngraph::pattern::any_input(ngraph::pattern::has_static_shape()),
ngraph::pattern::any_input(ngraph::pattern::has_static_shape())};
auto power = ngraph::pattern::wrap_type<ngraph::opset1::Power>(twoInputs);
auto add = ngraph::pattern::wrap_type<ngraph::opset1::Add>(twoInputs);
auto sub = ngraph::pattern::wrap_type<ngraph::opset1::Subtract>(twoInputs);
auto mult = ngraph::pattern::wrap_type<ngraph::opset1::Multiply>(twoInputs);
const auto candidate = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{power, add, sub, mult});
ngraph::matcher_pass_callback callback = [this](ngraph::pattern::Matcher &m) {
auto node = m.get_match_root();
std::shared_ptr<ngraph::Node> toReplace = node;
if (auto power = std::dynamic_pointer_cast<ngraph::opset1::Power>(node)) {
if (!isConvertableToPowerStatic(power))
return false;
toReplace = convert(power);
} else if (auto add = std::dynamic_pointer_cast<ngraph::opset1::Add>(node)) {
if (!isConvertableToPowerStatic(add))
return false;
toReplace = convert(add);
} else if (auto sub = std::dynamic_pointer_cast<ngraph::opset1::Subtract>(node)) {
if (!isConvertableToPowerStatic(sub))
return false;
toReplace = convert(sub);
} else if (auto mult = std::dynamic_pointer_cast<ngraph::opset1::Multiply>(node)) {
if (!isConvertableToPowerStatic(mult))
return false;
toReplace = convert(mult);
} else {
throw ngraph::ngraph_error("ConvertToPowerStatic: op type is not supported");
}
toReplace->set_friendly_name(node->get_friendly_name());
ngraph::copy_runtime_info(node, toReplace);
ngraph::replace_node(node, toReplace);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(candidate, "ConvertToPowerStatic");
this->register_matcher(m, callback);
}

View File

@ -0,0 +1,17 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
namespace MKLDNNPlugin {
class ConvertToPowerStatic: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
ConvertToPowerStatic();
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,41 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "convert_to_swish_cpu.hpp"
#include <ngraph/opsets/opset4.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include "op/swish_cpu.hpp"
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::ConvertToSwishCPU, "ConvertToSwishCPU", 0);
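// Replaces opset4::Swish with the CPU specific SwishNode, folding an optional single-element
// beta Constant into the node's alpha attribute (default 1.0).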
MKLDNNPlugin::ConvertToSwishCPU::ConvertToSwishCPU() {
auto swish = ngraph::pattern::wrap_type<ngraph::opset4::Swish>();
ngraph::matcher_pass_callback callback = [](ngraph::pattern::Matcher& m) {
auto swish = std::dynamic_pointer_cast<ngraph::opset4::Swish> (m.get_match_root());
if (!swish) {
return false;
}
float beta_value = 1.0;
if (swish->input_values().size() == 2) {
auto beta = std::dynamic_pointer_cast<ngraph::opset4::Constant>(swish->get_input_node_shared_ptr(1));
if (!beta || ngraph::shape_size(swish->get_input_shape(1)) != 1) {
return false;
}
beta_value = beta->cast_vector<float>()[0];
}
auto swish_cpu = std::make_shared<MKLDNNPlugin::SwishNode>(swish->input(0).get_source_output(), beta_value);
swish_cpu->set_friendly_name(swish->get_friendly_name());
ngraph::copy_runtime_info(swish, swish_cpu);
ngraph::replace_node(swish, swish_cpu);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(swish, "ConvertToSwishCPU");
this->register_matcher(m, callback);
}

View File

@ -0,0 +1,17 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
namespace MKLDNNPlugin {
class ConvertToSwishCPU: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
ConvertToSwishCPU();
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,70 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "fc_bias_fusion.hpp"
#include "op/fully_connected.hpp"
#include <numeric>
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::FullyConnectedBiasFusion, "FullyConnectedBiasFusion", 0);
MKLDNNPlugin::FullyConnectedBiasFusion::FullyConnectedBiasFusion() {
auto m_fc = ngraph::pattern::wrap_type<MKLDNNPlugin::FullyConnectedNode>([](ngraph::Output<ngraph::Node> output) {
return ngraph::pattern::consumers_count(1)(output) && ngraph::pattern::has_static_shape()(output);
});
auto m_bias = ngraph::pattern::any_input();
auto m_add = ngraph::pattern::wrap_type<ngraph::opset1::Add>({m_fc, m_bias});
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
auto & pattern_to_output = m.get_pattern_value_map();
auto add = pattern_to_output[m_add].get_node_shared_ptr();
auto bias = pattern_to_output[m_bias].get_node_shared_ptr();
auto fc = std::dynamic_pointer_cast<MKLDNNPlugin::FullyConnectedNode>(pattern_to_output[m_fc].get_node_shared_ptr());
if (!fc) {
return false;
}
if (auto bcast = std::dynamic_pointer_cast<ngraph::opset1::Broadcast>(bias)) {
bias = bcast->input_value(0).get_node_shared_ptr();
}
if (!std::dynamic_pointer_cast<ngraph::opset1::Constant>(bias)) {
return false;
}
ngraph::Shape bias_shape(bias->get_shape());
ngraph::Shape output_shape(fc->get_shape());
size_t bias_size = std::accumulate(bias_shape.begin(), bias_shape.end(), size_t{1}, std::multiplies<int64_t>());
if (bias_shape.empty() || bias_shape.back() != output_shape.back() || bias_shape.back() != bias_size) {
return false;
}
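// For example, an FC output of shape {N, 768} can be fused with a bias of shape {768} or {1, 768};
// a multi-dimensional bias is flattened to 1D below before it is attached to the new FC node.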
ngraph::NodeVector new_ops;
std::shared_ptr<ngraph::Node> final_bias = bias;
if (bias->get_shape().size() >= 2) {
final_bias = std::make_shared<ngraph::opset1::Reshape>(final_bias, ngraph::opset1::Constant::create(ngraph::element::i64,
ngraph::Shape{1}, {-1}), true);
new_ops.push_back(final_bias);
}
auto new_fc = std::make_shared<MKLDNNPlugin::FullyConnectedNode>(fc->input(0).get_source_output(),
fc->input(1).get_source_output(),
final_bias,
fc->get_shape(),
fc->get_output_type());
new_ops.push_back(new_fc);
new_fc->set_friendly_name(add->get_friendly_name());
ngraph::copy_runtime_info({fc, add}, new_ops);
ngraph::replace_node(add, new_fc);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(m_add, "FullyConnectedBiasFusion");
this->register_matcher(m, callback);
}

View File

@ -0,0 +1,17 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
namespace MKLDNNPlugin {
class FullyConnectedBiasFusion : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
FullyConnectedBiasFusion();
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,45 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "fully_connected.hpp"
constexpr ngraph::NodeTypeInfo MKLDNNPlugin::FullyConnectedNode::type_info;
MKLDNNPlugin::FullyConnectedNode::FullyConnectedNode(const ngraph::Output<Node>& A,
const ngraph::Output<Node>& B,
const ngraph::Shape& output_shape,
const ngraph::element::Type output_type)
: Op({A, B}), m_output_shape(output_shape), m_output_type(output_type) {
constructor_validate_and_infer_types();
}
MKLDNNPlugin::FullyConnectedNode::FullyConnectedNode(const ngraph::Output<Node>& A,
const ngraph::Output<Node>& B,
const ngraph::Output<Node>& C,
const ngraph::Shape& output_shape,
const ngraph::element::Type output_type)
: Op({A, B, C}), m_output_shape(output_shape), m_output_type(output_type) {
constructor_validate_and_infer_types();
}
std::shared_ptr<ngraph::Node> MKLDNNPlugin::FullyConnectedNode::clone_with_new_inputs(const ngraph::OutputVector& new_args) const {
check_new_args_count(this, new_args);
if (new_args.size() == 2) {
return std::make_shared<MKLDNNPlugin::FullyConnectedNode>(new_args.at(0), new_args.at(1), m_output_shape, m_output_type);
} else if (new_args.size() == 3) {
return std::make_shared<MKLDNNPlugin::FullyConnectedNode>(new_args.at(0), new_args.at(1), new_args.at(2), m_output_shape, m_output_type);
}
throw ngraph::ngraph_error("Unsupported number of arguments for FullyConnected operation");
}
void MKLDNNPlugin::FullyConnectedNode::validate_and_infer_types() {
m_output_size = m_output_shape.back();
set_output_type(0, m_output_type == ngraph::element::undefined ? input_value(0).get_element_type() : m_output_type, m_output_shape);
}
bool MKLDNNPlugin::FullyConnectedNode::visit_attributes(ngraph::AttributeVisitor &visitor) {
visitor.on_attribute("out-size", m_output_size);
return true;
}

View File

@ -0,0 +1,47 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/node.hpp>
#include <ngraph/op/op.hpp>
#include <ngraph/op/util/fused_op.hpp>
namespace MKLDNNPlugin {
class FullyConnectedNode : public ngraph::op::Op {
public:
static constexpr ngraph::NodeTypeInfo type_info{"FullyConnected", 0};
const ngraph::NodeTypeInfo& get_type_info() const override { return type_info; }
FullyConnectedNode() = default;
FullyConnectedNode(const ngraph::Output<Node> &A,
const ngraph::Output<Node> &B,
const ngraph::Shape &output_shape,
const ngraph::element::Type output_type = ngraph::element::undefined);
FullyConnectedNode(const ngraph::Output<Node> &A,
const ngraph::Output<Node> &B,
const ngraph::Output<Node> &C,
const ngraph::Shape &output_shape,
const ngraph::element::Type output_type = ngraph::element::undefined);
bool visit_attributes(ngraph::AttributeVisitor &visitor) override;
void validate_and_infer_types() override;
std::shared_ptr<Node> clone_with_new_inputs(const ngraph::OutputVector& new_args) const override;
size_t get_out_size() const { return m_output_size; }
ngraph::element::Type get_output_type() const { return m_output_type; }
private:
size_t m_output_size = 0;
ngraph::Shape m_output_shape = {};
ngraph::element::Type m_output_type;
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,31 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "leaky_relu.hpp"
constexpr ngraph::NodeTypeInfo MKLDNNPlugin::LeakyReluNode::type_info;
MKLDNNPlugin::LeakyReluNode::LeakyReluNode(const ngraph::Output<ngraph::Node> &data,
const float &negative_slope,
const ngraph::element::Type output_type)
: Op({data}), m_negative_slope(negative_slope), m_output_type(output_type) {
constructor_validate_and_infer_types();
}
std::shared_ptr<ngraph::Node> MKLDNNPlugin::LeakyReluNode::clone_with_new_inputs(const ngraph::OutputVector& new_args) const {
check_new_args_count(this, new_args);
return std::make_shared<MKLDNNPlugin::LeakyReluNode>(new_args.at(0), m_negative_slope, m_output_type);
}
void MKLDNNPlugin::LeakyReluNode::validate_and_infer_types() {
set_output_type(
0,
m_output_type == ngraph::element::undefined ? get_input_element_type(0) : m_output_type,
get_input_partial_shape(0));
}
bool MKLDNNPlugin::LeakyReluNode::visit_attributes(ngraph::AttributeVisitor &visitor) {
visitor.on_attribute("negative_slope", m_negative_slope);
return true;
}

View File

@ -0,0 +1,33 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/op/op.hpp>
namespace MKLDNNPlugin {
class LeakyReluNode : public ngraph::op::Op {
public:
static constexpr ngraph::NodeTypeInfo type_info{"LeakyRelu", 0};
const ngraph::NodeTypeInfo& get_type_info() const override { return type_info; }
LeakyReluNode(const ngraph::Output<ngraph::Node> &data, const float &negative_slope, const ngraph::element::Type output_type);
void validate_and_infer_types() override;
bool visit_attributes(ngraph::AttributeVisitor &visitor) override;
std::shared_ptr<ngraph::Node> clone_with_new_inputs(const ngraph::OutputVector &new_args) const override;
float get_slope() { return m_negative_slope; }
ngraph::element::Type get_output_type() const { return m_output_type; }
private:
float m_negative_slope;
ngraph::element::Type m_output_type;
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,35 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "power_static.hpp"
constexpr ngraph::NodeTypeInfo MKLDNNPlugin::PowerStaticNode::type_info;
MKLDNNPlugin::PowerStaticNode::PowerStaticNode(const ngraph::Output<Node> &data,
const float &power,
const float &scale,
const float &shift,
const ngraph::element::Type output_type)
: Op({data}), scale(scale), power(power), shift(shift), m_output_type(output_type) {
constructor_validate_and_infer_types();
}
std::shared_ptr<ngraph::Node> MKLDNNPlugin::PowerStaticNode::clone_with_new_inputs(const ngraph::OutputVector &new_args) const {
if (new_args.size() != 1) {
throw ngraph::ngraph_error("Incorrect number of new arguments");
}
return std::make_shared<MKLDNNPlugin::PowerStaticNode>(new_args.at(0), this->power, this->scale, this->shift, this->m_output_type);
}
void MKLDNNPlugin::PowerStaticNode::validate_and_infer_types() {
set_output_type(0, m_output_type == ngraph::element::undefined ? get_input_element_type(0) : m_output_type, get_input_partial_shape(0));
}
bool MKLDNNPlugin::PowerStaticNode::visit_attributes(ngraph::AttributeVisitor &visitor) {
visitor.on_attribute("scale", scale);
visitor.on_attribute("power", power);
visitor.on_attribute("shift", shift);
return true;
}

View File

@ -0,0 +1,34 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/op/op.hpp>
namespace MKLDNNPlugin {
class PowerStaticNode : public ngraph::op::Op {
public:
static constexpr ngraph::NodeTypeInfo type_info{"PowerStatic", 0};
const ngraph::NodeTypeInfo& get_type_info() const override { return type_info; }
PowerStaticNode(const ngraph::Output<ngraph::Node> &data, const float &power, const float &scale, const float &shift,
const ngraph::element::Type output_type = ngraph::element::undefined);
void validate_and_infer_types() override;
bool visit_attributes(ngraph::AttributeVisitor &visitor) override;
std::shared_ptr<ngraph::Node> clone_with_new_inputs(const ngraph::OutputVector &new_args) const override;
float get_power() const { return power; }
float get_scale() const { return scale; }
float get_shift() const { return shift; }
private:
float scale, power, shift;
ngraph::element::Type m_output_type;
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,31 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "swish_cpu.hpp"
constexpr ngraph::NodeTypeInfo MKLDNNPlugin::SwishNode::type_info;
MKLDNNPlugin::SwishNode::SwishNode(const ngraph::Output<ngraph::Node> & input, const float alpha)
: Op({input}), m_alpha(alpha) {
constructor_validate_and_infer_types();
}
std::shared_ptr<ngraph::Node> MKLDNNPlugin::SwishNode::clone_with_new_inputs(const ngraph::OutputVector& new_args) const {
check_new_args_count(this, new_args);
return std::make_shared<MKLDNNPlugin::SwishNode>(new_args.at(0), m_alpha);
}
bool MKLDNNPlugin::SwishNode::visit_attributes(ngraph::AttributeVisitor& visitor) {
visitor.on_attribute("alpha", m_alpha);
return true;
}
void MKLDNNPlugin::SwishNode::validate_and_infer_types() {
set_output_type(0, get_input_element_type(0), get_input_partial_shape(0));
}
float MKLDNNPlugin::SwishNode::get_alpha() const {
return m_alpha;
}

View File

@ -0,0 +1,27 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/op/op.hpp>
namespace MKLDNNPlugin {
class SwishNode : public ngraph::op::Op {
public:
static constexpr ngraph::NodeTypeInfo type_info{"SwishCPU", 0};
const ngraph::NodeTypeInfo &get_type_info() const override { return type_info; }
explicit SwishNode(const ngraph::Output<Node> &input, float alpha = 1.0);
void validate_and_infer_types() override;
bool visit_attributes(ngraph::AttributeVisitor& visitor) override;
std::shared_ptr<ngraph::Node> clone_with_new_inputs(const ngraph::OutputVector &new_args) const override;
float get_alpha() const;
protected:
float m_alpha;
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,175 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "reshape_1d_ops.hpp"
#include <memory>
#include <vector>
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph_ops/type_relaxed.hpp>
#include "transformations/utils/utils.hpp"
template <class BaseOp>
std::shared_ptr<ngraph::Node> convert(const ngraph::Output<ngraph::Node> & data, std::shared_ptr<BaseOp> node, ngraph::NodeVector &new_ops) {
auto new_strides = node->get_strides();
auto new_dilations = node->get_dilations();
auto new_pads_begin = node->get_pads_begin();
auto new_pad_end = node->get_pads_end();
new_strides.insert(new_strides.begin(), 1);
new_dilations.insert(new_dilations.begin(), 1);
new_pads_begin.insert(new_pads_begin.begin(), 0);
new_pad_end.insert(new_pad_end.begin(), 0);
ngraph::Shape new_weights_shape(node->input_value(1).get_shape());
new_weights_shape.insert(new_weights_shape.begin() + new_weights_shape.size() - 1, 1);
auto weights = ngraph::op::util::reshapeTo(node->input_value(1), new_weights_shape);
new_ops.push_back(weights);
if (std::dynamic_pointer_cast<ngraph::op::TypeRelaxedBase>(node)) {
return std::make_shared<ngraph::op::TypeRelaxed<BaseOp>>(std::vector<ngraph::element::Type>{ngraph::element::f32, ngraph::element::f32},
std::vector<ngraph::element::Type>{ngraph::element::f32},
ngraph::op::TemporaryReplaceOutputType(data, ngraph::element::f32).get(),
ngraph::op::TemporaryReplaceOutputType(weights, ngraph::element::f32).get(),
new_strides,
new_pads_begin,
new_pad_end,
new_dilations,
node->get_auto_pad());
} else {
return std::make_shared<BaseOp>(data,
weights,
new_strides,
new_pads_begin,
new_pad_end,
new_dilations,
node->get_auto_pad());
}
}
template <>
std::shared_ptr<ngraph::Node> convert(const ngraph::Output<ngraph::Node> & data, std::shared_ptr<ngraph::opset1::MaxPool> node, ngraph::NodeVector & new_ops) {
auto new_strides = node->get_strides();
auto new_pads_begin = node->get_pads_begin();
auto new_pad_end = node->get_pads_end();
auto new_kernel = node->get_kernel();
new_strides.insert(new_strides.begin(), 1);
new_pads_begin.insert(new_pads_begin.begin(), 0);
new_pad_end.insert(new_pad_end.begin(), 0);
new_kernel.insert(new_kernel.begin(), 1);
return std::make_shared<ngraph::opset1::MaxPool>(data,
new_strides,
new_pads_begin,
new_pad_end,
new_kernel,
node->get_rounding_type(),
node->get_auto_pad());
}
template <>
std::shared_ptr<ngraph::Node> convert(const ngraph::Output<ngraph::Node> & data, std::shared_ptr<ngraph::opset1::AvgPool> node, ngraph::NodeVector & new_ops) {
// Update Pooling attributes with additional dimension
auto new_strides = node->get_strides();
auto new_pads_begin = node->get_pads_begin();
auto new_pad_end = node->get_pads_end();
auto new_kernel = node->get_kernel();
new_strides.insert(new_strides.begin(), 1);
new_pads_begin.insert(new_pads_begin.begin(), 0);
new_pad_end.insert(new_pad_end.begin(), 0);
new_kernel.insert(new_kernel.begin(), 1);
return std::make_shared<ngraph::opset1::AvgPool>(data,
new_strides,
new_pads_begin,
new_pad_end,
new_kernel,
node->get_exclude_pad(),
node->get_rounding_type(),
node->get_auto_pad());
}
ngraph::matcher_pass_callback get_callback() {
return [](ngraph::pattern::Matcher& m) {
auto node = m.get_match_root();
if (node->input(0).get_partial_shape().rank().get_length() != 3) {
return false;
}
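// For example, a 1D convolution over an input of shape {N, C, W} is rewritten as
// Reshape -> {N, C, 1, W}, a 2D convolution with kernel {1, k}, and a Reshape back to the original output shape.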
// Insert H dimension equal to 1
auto input_shape = node->input(0).get_shape();
auto output_shape = node->output(0).get_shape();
input_shape.insert(input_shape.begin() + 2, 1);
ngraph::NodeVector new_ops;
// Reshape(input_shape)->Op->Reshape(output_shape)
ngraph::Output<ngraph::Node> last = ngraph::op::util::reshapeTo(node->input_value(0), input_shape);
last.get_node_shared_ptr()->set_friendly_name(node->get_friendly_name() + "/reshape_begin");
new_ops.push_back(last.get_node_shared_ptr());
if (auto conv = std::dynamic_pointer_cast<ngraph::opset1::Convolution>(node)) {
last = convert(last, conv, new_ops);
} else if (auto group_conv = std::dynamic_pointer_cast<ngraph::opset1::GroupConvolution>(node)) {
last = convert(last, group_conv, new_ops);
} else if (auto max_pool = std::dynamic_pointer_cast<ngraph::opset1::MaxPool>(node)) {
last = convert(last, max_pool, new_ops);
} else if (auto avg_pool = std::dynamic_pointer_cast<ngraph::opset1::AvgPool>(node)) {
last = convert(last, avg_pool, new_ops);
} else {
throw ngraph::ngraph_error("Reshape1DOps: op type is not supported");
}
last.get_node_shared_ptr()->set_friendly_name(node->get_friendly_name() + "/new");
new_ops.push_back(last.get_node_shared_ptr());
last = ngraph::op::util::reshapeTo(last, output_shape);
last.get_node_shared_ptr()->set_friendly_name(node->get_friendly_name());
new_ops.push_back(last.get_node_shared_ptr());
ngraph::copy_runtime_info(node, new_ops);
node->output(0).replace(last);
return true;
};
}
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::Reshape1DConvolution, "Reshape1DConvolution", 0);
MKLDNNPlugin::Reshape1DConvolution::Reshape1DConvolution() {
auto conv = ngraph::pattern::wrap_type<ngraph::opset1::Convolution>(ngraph::pattern::has_static_shape());
auto m = std::make_shared<ngraph::pattern::Matcher>(conv, "Reshape1DConvolution");
this->register_matcher(m, get_callback());
}
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::Reshape1DGroupConvolution, "Reshape1DGroupConvolution", 0);
MKLDNNPlugin::Reshape1DGroupConvolution::Reshape1DGroupConvolution() {
auto group_conv = ngraph::pattern::wrap_type<ngraph::opset1::GroupConvolution>(ngraph::pattern::has_static_shape());
auto m = std::make_shared<ngraph::pattern::Matcher>(group_conv, "Reshape1DGroupConvolution");
this->register_matcher(m, get_callback());
}
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::Reshape1DAvgPool, "Reshape1DAvgPool", 0);
MKLDNNPlugin::Reshape1DAvgPool::Reshape1DAvgPool() {
auto pool = ngraph::pattern::wrap_type<ngraph::opset1::AvgPool>(ngraph::pattern::has_static_shape());
auto m = std::make_shared<ngraph::pattern::Matcher>(pool, "Reshape1DAvgPool");
this->register_matcher(m, get_callback());
}
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::Reshape1DMaxPool, "Reshape1DMaxPool", 0);
MKLDNNPlugin::Reshape1DMaxPool::Reshape1DMaxPool() {
auto pool = ngraph::pattern::wrap_type<ngraph::opset1::MaxPool>(ngraph::pattern::has_static_shape());
auto m = std::make_shared<ngraph::pattern::Matcher>(pool, "Reshape1DMaxPool");
this->register_matcher(m, get_callback());
}

View File

@ -0,0 +1,35 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
namespace MKLDNNPlugin {
class Reshape1DConvolution: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
Reshape1DConvolution();
};
class Reshape1DGroupConvolution: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
Reshape1DGroupConvolution();
};
class Reshape1DAvgPool: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
Reshape1DAvgPool();
};
class Reshape1DMaxPool: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
Reshape1DMaxPool();
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,80 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "reshape_fc_fusion.hpp"
#include "op/fully_connected.hpp"
#include <numeric>
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/pattern/op/or.hpp>
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::ReshapeFullyConnectedFusion, "ReshapeFullyConnectedFusion", 0);
MKLDNNPlugin::ReshapeFullyConnectedFusion::ReshapeFullyConnectedFusion() {
auto m_reshape = ngraph::pattern::wrap_type<ngraph::opset1::Reshape>(ngraph::pattern::has_static_shape());
ngraph::OutputVector twoInputs = {m_reshape, ngraph::pattern::any_input()};
ngraph::OutputVector threeInputs = {m_reshape, ngraph::pattern::any_input(), ngraph::pattern::any_input()};
auto fcTwoInputs = ngraph::pattern::wrap_type<MKLDNNPlugin::FullyConnectedNode>(twoInputs, ngraph::pattern::has_static_shape());
auto fcThreeInputs = ngraph::pattern::wrap_type<MKLDNNPlugin::FullyConnectedNode>(threeInputs, ngraph::pattern::has_static_shape());
const auto fcTwoOrThreeInputs = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{fcTwoInputs, fcThreeInputs});
ngraph::matcher_pass_callback callback = [this](ngraph::pattern::Matcher &m) {
auto fc = std::dynamic_pointer_cast<MKLDNNPlugin::FullyConnectedNode>(m.get_match_root());
auto reshape = std::dynamic_pointer_cast<ngraph::opset1::Reshape>(fc->get_input_node_shared_ptr(0));
// Check that Reshape reshapes 4D tensor to 2D or input shape = output shape
auto shape_in = reshape->input_value(0).get_shape();
auto shape_out = reshape->get_shape();
if (!((shape_in.size() == 4 && reshape->get_shape().size() == 2) || (shape_in == shape_out && !shape_in.empty()))) {
return false;
}
// Check that Weights [O, C*H*W] are consistent with Input [N, C, H, W]
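// Example (assumed values): Input [1, 3, 224, 224] flattened by the Reshape to [1, 150528]
// is only fusable when the weights have shape [O, 150528].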
auto shape_w = fc->input_value(1).get_shape();
if (shape_in[0] != shape_out[0] || std::accumulate(shape_in.begin() + 1, shape_in.end(), size_t{1}, std::multiplies<size_t>()) != shape_w[1]) {
return false;
}
ngraph::NodeVector new_ops;
auto weightInput = fc->input(1).get_source_output();
ngraph::Shape newWeightsShape;
const auto outShape = fc->get_shape();
if (shape_in.size() == 3) {
newWeightsShape = ngraph::Shape({outShape[2], shape_in[2]});
} else {
newWeightsShape.push_back(outShape[1]);
for (int i = 1; i < shape_in.size(); i++)
newWeightsShape.push_back(shape_in[i]);
}
if (newWeightsShape != weightInput.get_shape()) {
auto newShape = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i64, ngraph::Shape{newWeightsShape.size()}, newWeightsShape);
weightInput = std::make_shared<ngraph::opset1::Reshape>(weightInput, newShape, true);
new_ops.push_back(weightInput.get_node_shared_ptr());
}
std::shared_ptr<ngraph::Node> new_fc;
if (fc->get_input_size() == 2) {
new_fc = std::make_shared<MKLDNNPlugin::FullyConnectedNode>(reshape->input_value(0),
weightInput,
outShape,
fc->output(0).get_element_type());
} else if (fc->get_input_size() == 3) {
new_fc = std::make_shared<MKLDNNPlugin::FullyConnectedNode>(reshape->input_value(0),
weightInput,
fc->input_value(2),
outShape,
fc->output(0).get_element_type());
}
new_ops.push_back(new_fc);
new_fc->set_friendly_name(fc->get_friendly_name());
ngraph::copy_runtime_info({reshape, fc}, new_ops);
ngraph::replace_node(fc, new_fc);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(fcTwoOrThreeInputs, "ReshapeFullyConnectedFusion");
register_matcher(m, callback);
}

View File

@ -0,0 +1,17 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
namespace MKLDNNPlugin {
class ReshapeFullyConnectedFusion : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
ReshapeFullyConnectedFusion();
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,84 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "reshape_fully_connected.hpp"
#include "op/fully_connected.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <transformations/utils/utils.hpp>
#include <ngraph/pattern/op/or.hpp>
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::ReshapeFullyConnected, "ReshapeFullyConnected", 0);
MKLDNNPlugin::ReshapeFullyConnected::ReshapeFullyConnected() {
ngraph::OutputVector twoInputs = {ngraph::pattern::any_input(ngraph::pattern::has_static_shape()), ngraph::pattern::any_input()};
ngraph::OutputVector threeInputs = {ngraph::pattern::any_input(ngraph::pattern::has_static_shape()), ngraph::pattern::any_input(),
ngraph::pattern::any_input()};
auto fcTwoInputs = ngraph::pattern::wrap_type<MKLDNNPlugin::FullyConnectedNode>(twoInputs, ngraph::pattern::has_static_shape());
auto fcThreeInputs = ngraph::pattern::wrap_type<MKLDNNPlugin::FullyConnectedNode>(threeInputs, ngraph::pattern::has_static_shape());
const auto fcTwoOrThreeInputs = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{fcTwoInputs, fcThreeInputs});
ngraph::matcher_pass_callback callback = [this](ngraph::pattern::Matcher& m) {
auto fc = std::dynamic_pointer_cast<MKLDNNPlugin::FullyConnectedNode> (m.get_match_root());
if (!fc || transformation_callback(fc)) {
return false;
}
auto input_shape = fc->input_value(0).get_shape();
auto output_shape = fc->get_shape();
if (input_shape.size() == 2) {
return false;
}
ngraph::NodeVector new_ops;
std::vector<int64_t> reshape_shape{-1, static_cast<int64_t>(input_shape.back())};
auto reshape = std::make_shared<ngraph::opset1::Reshape>(fc->input_value(0),
ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{2}, reshape_shape), true);
new_ops.push_back(reshape);
reshape->set_friendly_name(fc->get_friendly_name() + "/Reshape");
// Calculate output shape for new FullyConnected layer
// [I, K] * [O, K] = [I, O]
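// Worked example (assumed shapes): input [2, 3, 10] is reshaped with {-1, 10} to [6, 10];
// with weights of shape [5, 10] the new FullyConnected yields [6, 5], which is reshaped back to [2, 3, 5] below.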
auto I = reshape->get_shape()[0];
auto O = fc->input_value(1).get_shape()[0];
ngraph::Shape output_shape_new{I, O};
std::shared_ptr<ngraph::Node> fc_new;
if (fc->get_input_size() == 2) {
fc_new = std::make_shared<MKLDNNPlugin::FullyConnectedNode>(reshape,
fc->input_value(1),
output_shape_new,
fc->get_output_type());
} else if (fc->get_input_size() == 3) {
fc_new = std::make_shared<MKLDNNPlugin::FullyConnectedNode>(reshape,
fc->input_value(1),
fc->input_value(2),
output_shape_new,
fc->get_output_type());
}
new_ops.push_back(fc_new);
if (output_shape != output_shape_new) {
auto reshape_output = ngraph::op::util::reshapeTo(fc_new, output_shape);
new_ops.push_back(reshape_output);
reshape_output->set_friendly_name(fc->get_friendly_name());
fc_new->set_friendly_name(fc->get_friendly_name() + "/FC");
ngraph::copy_runtime_info(fc, new_ops);
ngraph::replace_node(fc, reshape_output);
} else {
fc_new->set_friendly_name(fc->get_friendly_name());
ngraph::copy_runtime_info(fc, new_ops);
ngraph::replace_node(fc, fc_new);
}
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(fcTwoOrThreeInputs, "ReshapeFullyConnected");
this->register_matcher(m, callback);
}

View File

@ -0,0 +1,25 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
/*
* Description:
 * ReshapeFullyConnected transformation detects FullyConnected operations
 * whose input rank is greater than 2 and inserts Reshape operations before
 * and after each such FullyConnected operation. This transformation is
 * required because of IE restrictions.
*/
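/*
 * Illustrative shape flow (assumed shapes):
 *     input [N, T, K] -> Reshape -> [N*T, K] -> FullyConnected (weights [O, K]) -> [N*T, O] -> Reshape -> [N, T, O]
 */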
namespace MKLDNNPlugin {
class ReshapeFullyConnected: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
ReshapeFullyConnected();
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,35 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "reshape_prelu.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include "transformations/utils/utils.hpp"
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::ReshapePRelu, "ReshapePRelu", 0);
MKLDNNPlugin::ReshapePRelu::ReshapePRelu() {
auto prelu = ngraph::pattern::wrap_type<ngraph::opset1::PRelu>({ngraph::pattern::any_input(ngraph::pattern::has_static_shape()),
ngraph::pattern::any_input(ngraph::pattern::has_static_shape())});
ngraph::matcher_pass_callback callback = [this](ngraph::pattern::Matcher& m) {
auto prelu = std::dynamic_pointer_cast<ngraph::opset1::PRelu>(m.get_match_root());
if (!prelu || ngraph::shape_size(prelu->get_input_shape(1)) == 1 || prelu->get_input_shape(1).size() != 1) {
return false;
}
ngraph::Shape new_shape(prelu->input_value(0).get_shape().size(), 1);
new_shape[new_shape.size() > 1 ? 1 : 0] = prelu->input_value(1).get_shape()[0];
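// Example (illustrative): a slope of shape [C] applied to a 4D input is reshaped to [1, C, 1, 1]
// so it broadcasts over the channel dimension only.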
auto slope = ngraph::op::util::reshapeTo(prelu->input_value(1), new_shape);
auto new_prelu = std::make_shared<ngraph::opset1::PRelu>(prelu->input(0).get_source_output(), slope);
new_prelu->set_friendly_name(prelu->get_friendly_name());
ngraph::copy_runtime_info(prelu, new_prelu);
ngraph::replace_node(prelu, new_prelu);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(prelu, "ReshapePRelu");
this->register_matcher(m, callback);
}

View File

@ -0,0 +1,17 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
namespace MKLDNNPlugin {
class ReshapePRelu: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
ReshapePRelu();
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,153 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "rnn_sequences_optimization.hpp"
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/pattern/op/or.hpp>
#include <transformations/utils/utils.hpp>
#include <ngraph/variant.hpp>
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::OptimizeGRUSequenceTransposes, "OptimizeGRUSequenceTransposes", 0);
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::OptimizeLSTMSequenceTransposes, "OptimizeLSTMSequenceTransposes", 0);
NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::OptimizeRNNSequenceTransposes, "OptimizeRNNSequenceTransposes", 0);
namespace {
int64_t getSeqAxis(const std::shared_ptr<ngraph::Node>& sequenceOp) {
// Optimization.
// Plug-ins support a seqAxis attribute (value 1 or 0) for Seq ops, but the attribute is not part of the
// spec, so the TI to Sequences transformation has to insert Transpose layers before and after each Seq op.
// These additional Transpose layers hurt performance, so we try to detect the pattern
// Transpose(axis_order={1,0,2}) -> Seq -> Transpose(axis_order={2,1,0,3})
// and replace the unnecessary Transpose ops with Reshapes, recording seqAxis = 0 in the runtime info
// to transfer the value of the attribute to plug-ins.
// todo: specify seqAxis attribute for Sequence ops.
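// Example (illustrative): a model that keeps its recurrent data sequence-first, e.g. [T, N, C] (assumed
// shape), only gets the Transpose pair to satisfy the batch-first layout required by the spec; once the
// pattern is matched the Transposes degrade into cheap Reshapes and the plug-in consumes the data as-is.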
int64_t seqAxis = 1; // default
const auto& target_inputs = sequenceOp->output(0).get_target_inputs();
if (target_inputs.size() == 1) {
const auto& transpose_before = std::dynamic_pointer_cast<ngraph::op::v1::Transpose>(sequenceOp->input_value(0).get_node_shared_ptr());
const auto& transpose_after = std::dynamic_pointer_cast<ngraph::op::v1::Transpose>(target_inputs.begin()->get_node()->shared_from_this());
if (transpose_after != nullptr && transpose_before != nullptr) {
auto order_before = std::dynamic_pointer_cast<ngraph::op::v0::Constant>(
transpose_before->input_value(1).get_node_shared_ptr());
auto order_after = std::dynamic_pointer_cast<ngraph::op::v0::Constant>(
transpose_after->input_value(1).get_node_shared_ptr());
if (order_before != nullptr && order_after != nullptr) {
auto order_before_values = order_before->cast_vector<int64_t>();
auto order_after_values = order_after->cast_vector<int64_t>();
std::vector<int64_t> order_ref_before = {1, 0, 2};
std::vector<int64_t> order_ref_after = {2, 1, 0, 3};
if (order_before_values == order_ref_before && order_after_values == order_ref_after) {
seqAxis = 0;
}
}
}
}
return seqAxis;
}
bool transform(const std::shared_ptr<ngraph::Node>& sequenceOp) {
// Detect pattern: Transpose_before -> Seq -> Transpose_after
auto seqAxis = getSeqAxis(sequenceOp);
if (seqAxis == 0) {
ngraph::Output<ngraph::Node> in_0 = sequenceOp->get_input_source_output(0).get_node_shared_ptr()->get_input_source_output(0);
auto newInShape = ngraph::op::v0::Constant::create(ngraph::element::i32, ngraph::Shape{3}, sequenceOp->get_input_shape(0));
auto reshape1 = std::make_shared<ngraph::op::v1::Reshape>(in_0, newInShape, false);
ngraph::replace_node(sequenceOp->get_input_node_shared_ptr(0), {reshape1->output(0)});
const auto &gruTargetInputs = sequenceOp->output(0).get_target_inputs();
if (gruTargetInputs.empty())
return false;
auto transposeAfter = gruTargetInputs.begin()->get_node()->shared_from_this();
auto newOutShape = ngraph::op::v0::Constant::create(ngraph::element::i32, ngraph::Shape{4}, transposeAfter->get_output_shape(0));
auto reshape2 = std::make_shared<ngraph::op::v1::Reshape>(sequenceOp->output(0), newOutShape, false);
reshape2->set_friendly_name(transposeAfter->get_friendly_name());
ngraph::replace_node(transposeAfter, {reshape2->output(0)});
} else {
auto originShape = sequenceOp->get_output_shape(0);
const auto targetInputs = sequenceOp->get_output_target_inputs(0);
if (targetInputs.empty()) {
return false;
}
auto seqOut = targetInputs.begin()->get_node()->shared_from_this();
auto tncShape = ngraph::op::v0::Constant::create(ngraph::element::i32, ngraph::Shape{3}, {originShape[2], originShape[0], originShape[3]});
auto reshape1 = std::make_shared<ngraph::op::v1::Reshape>(sequenceOp->output(0), tncShape, false);
auto order = ngraph::op::v0::Constant::create(ngraph::element::i32, ngraph::Shape{3}, {1, 0, 2});
auto transpose = std::make_shared<ngraph::op::v1::Transpose>(reshape1->output(0), order);
auto ndtcShape = ngraph::op::v0::Constant::create(ngraph::element::i32, ngraph::Shape{4}, originShape);
auto reshape2 = std::make_shared<ngraph::op::v1::Reshape>(transpose->output(0), ndtcShape, false);
reshape2->set_friendly_name(sequenceOp->get_friendly_name()+".0");
ngraph::insert_new_node_between(sequenceOp, seqOut, reshape2);
}
sequenceOp->get_rt_info()["seqAxis"] = std::make_shared<ngraph::VariantWrapper<int64_t>>(seqAxis);
return true;
}
} // namespace
MKLDNNPlugin::OptimizeGRUSequenceTransposes::OptimizeGRUSequenceTransposes() {
ngraph::matcher_pass_callback callback = [](ngraph::pattern::Matcher &m) {
auto gruSequence = std::dynamic_pointer_cast<ngraph::op::v5::GRUSequence>(m.get_match_root());
if (!gruSequence) {
return false;
}
// Bidirectional cases are not supported
if (gruSequence->get_direction() == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL)
return false;
return transform(gruSequence);
};
auto gruSequenceNgraph = ngraph::pattern::wrap_type<ngraph::op::v5::GRUSequence>();
auto m = std::make_shared<ngraph::pattern::Matcher>(gruSequenceNgraph, "OptimizeGRUSequenceTransposes");
this->register_matcher(m, callback);
}
MKLDNNPlugin::OptimizeRNNSequenceTransposes::OptimizeRNNSequenceTransposes() {
ngraph::matcher_pass_callback callback = [](ngraph::pattern::Matcher &m) {
auto rnnSequence = std::dynamic_pointer_cast<ngraph::op::v5::RNNSequence>(m.get_match_root());
if (!rnnSequence) {
return false;
}
// Bidirectional cases are not supported
if (rnnSequence->get_direction() == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL)
return false;
return transform(rnnSequence);
};
auto rnnSequenceNgraph = ngraph::pattern::wrap_type<ngraph::op::v5::RNNSequence>();
auto m = std::make_shared<ngraph::pattern::Matcher>(rnnSequenceNgraph, "OptimizeRNNSequenceTransposes");
this->register_matcher(m, callback);
}
MKLDNNPlugin::OptimizeLSTMSequenceTransposes::OptimizeLSTMSequenceTransposes() {
ngraph::matcher_pass_callback callback = [](ngraph::pattern::Matcher &m) {
auto lstmSequence = std::dynamic_pointer_cast<ngraph::op::v5::LSTMSequence>(m.get_match_root());
if (!lstmSequence) {
return false;
}
// Bidirectional cases are not supported
if (lstmSequence->get_direction() == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL)
return false;
return transform(lstmSequence);
};
auto lstmSequenceNgraph_0 = ngraph::pattern::wrap_type<ngraph::op::v0::LSTMSequence>();
auto lstmSequenceNgraph_5 = ngraph::pattern::wrap_type<ngraph::op::v5::LSTMSequence>();
const auto lstmSeqInputs = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{lstmSequenceNgraph_0, lstmSequenceNgraph_5});
auto m = std::make_shared<ngraph::pattern::Matcher>(lstmSeqInputs, "OptimizeLSTMSequenceTransposes");
this->register_matcher(m, callback);
}

View File

@ -0,0 +1,29 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
namespace MKLDNNPlugin {
class OptimizeGRUSequenceTransposes : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
OptimizeGRUSequenceTransposes();
};
class OptimizeLSTMSequenceTransposes : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
OptimizeLSTMSequenceTransposes();
};
class OptimizeRNNSequenceTransposes : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
OptimizeRNNSequenceTransposes();
};
} // namespace MKLDNNPlugin

View File

@ -1,55 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "base.hpp"
#include "argmax_imp.hpp"
#include <string>
#include <vector>
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class ArgMaxImpl: public ExtLayerBase {
public:
explicit ArgMaxImpl(const CNNLayer* layer) {
try {
if (layer->insData.size() != 1 || layer->outData.empty())
IE_THROW() << "Incorrect number of input/output edges!";
conf.out_max_val_ = layer->GetParamAsBool("out_max_val", false);
conf.top_k_ = layer->GetParamAsInt("top_k");
conf.has_axis_ = (layer->params.find("axis") != layer->params.end());
conf.axis_index_ = conf.has_axis_ ?
std::stoi(layer->params.at("axis")) :0;
addConfig(layer, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, Precision::FP32)});
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
}
}
StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs,
ResponseDesc *resp) noexcept override {
SizeVector in_dims = inputs[0]->getTensorDesc().getDims();
float* src_data = inputs[0]->buffer();
float* dst_data = outputs[0]->buffer();
XARCH::arg_max_execute(src_data, dst_data, in_dims, conf);
return OK;
}
private:
argmax_conf conf;
};
REG_FACTORY_FOR(ArgMaxImpl, ArgMax);
} // namespace Cpu
} // namespace Extensions
} // namespace InferenceEngine

View File

@ -1,417 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "argmax_imp.hpp"
#include <cstring>
#include <algorithm>
#include <string>
#include <vector>
#include <cmath>
#include <utility>
#include <functional>
#include <ie_parallel.hpp>
#if defined(HAVE_SSE) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
#include <immintrin.h>
#include "nodes/common/uni_simd.h"
#endif
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
namespace XARCH {
using Shape = std::vector<size_t>;
#if defined(HAVE_AVX512F)
constexpr int count_vec = 32;
#elif defined(HAVE_SSE) || defined(HAVE_AVX2)
constexpr int count_vec = 16;
#endif
inline int count(Shape dims, size_t start_ind, size_t end_ind) {
size_t count = 1;
for (size_t i = start_ind; i < end_ind; i++)
count *= dims[i];
return static_cast<int>(count);
}
inline int count(Shape dims, size_t start_ind = 0) {
return count(dims, start_ind, dims.size());
}
template <bool out_max_val>
void argmax_one_class_has_axis(const float* src_data, float* dst_data, Shape in_dims, argmax_conf& conf) {
const auto axis_index_ = conf.axis_index_;
int axis_ = (axis_index_ < 0) ? axis_index_ + static_cast<int>(in_dims.size()) : axis_index_;
const int dim = static_cast<int>(in_dims[axis_]);
int before_num = count(in_dims, 0, axis_);
int after_num = count(in_dims, axis_ + 1, in_dims.size());
int first_index = 0;
#if defined(HAVE_AVX512F)
const int block_size = 16;
typedef __m512 vec_type_f;
typedef __m512i vec_type_i;
typedef __mmask16 vmask_type;
#elif defined(HAVE_AVX2)
const int block_size = 8;
typedef __m256 vec_type_f;
typedef __m256i vec_type_i;
typedef __m256 vmask_type;
#elif defined(HAVE_SSE)
const int block_size = 4;
typedef __m128 vec_type_f;
typedef __m128i vec_type_i;
typedef __m128 vmask_type;
#endif
#if defined(HAVE_SSE) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
parallel_for2d(before_num, after_num / block_size, [&](int i0, int ib1) {
int s_index = i0 * dim * after_num + ib1 * block_size;
vec_type_f vmax_val = _mm_uni_loadu_ps(src_data + s_index);
vec_type_i vindex_max_val = _mm_uni_setzero_si();
for (int i2 = 1; i2 < dim; i2++) {
s_index += after_num;
vec_type_f vsrc = _mm_uni_loadu_ps(src_data + s_index);
vmask_type vmask = _mm_uni_cmpgt_ps(vsrc, vmax_val);
vmax_val = _mm_uni_blendv_ps(vmax_val, vsrc, vmask);
if (!out_max_val) {
vec_type_i vindex_cur_val = _mm_uni_set1_epi32(i2);
#if defined(HAVE_AVX512F)
vindex_max_val = _mm512_mask_blend_epi32(vmask, vindex_max_val, vindex_cur_val);
#else
vindex_max_val = _mm_uni_blendv_epi8(vindex_max_val, vindex_cur_val, _mm_uni_castps_si(vmask));
#endif
}
}
if (!out_max_val) {
vec_type_f vindex_max_val_fp32 = _mm_uni_cvtepi32_ps(vindex_max_val);
_mm_uni_storeu_ps(dst_data + i0 * after_num + ib1 * block_size, vindex_max_val_fp32);
} else {
_mm_uni_storeu_ps(dst_data + i0 * after_num + ib1 * block_size, vmax_val);
}
});
first_index = after_num / block_size * block_size;
#endif
int rest = after_num - first_index;
parallel_for2d(before_num, rest, [&](int i0, int i1) {
int index_max_val = 0;
int s_index = i0 * dim * after_num + first_index + i1;
float max_val = src_data[s_index];
for (int i2 = 1; i2 < dim; i2++) {
s_index += after_num;
if (src_data[s_index] > max_val) {
max_val = src_data[s_index];
if (!out_max_val) {
index_max_val = i2;
}
}
}
if (!out_max_val)
dst_data[i0 * after_num + first_index + i1] = static_cast<float>(index_max_val);
else
dst_data[i0 * after_num + first_index + i1] = max_val;
});
}
template <bool out_max_val>
void argmax_one_class(const float* src_data, float* dst_data, Shape in_dims) {
const int dim = count(in_dims, 1);
int before_num = in_dims[0];
parallel_for(before_num, [&](int i0) {
int index_max_val = 0;
int s_index = i0 * dim;
float max_val = src_data[s_index];
for (int i1 = 1; i1 < dim; i1++) {
s_index++;
if (src_data[s_index] > max_val) {
max_val = src_data[s_index];
index_max_val = i1;
}
}
if (!out_max_val) {
dst_data[i0] = static_cast<float>(index_max_val);
} else {
dst_data[i0 * 2] = static_cast<float>(index_max_val);
dst_data[i0 * 2 + 1] = max_val;
}
});
}
template <bool out_max_val>
void argmax_many_classes_has_axis(const float* src_data, float* dst_data, Shape in_dims, argmax_conf& conf) {
const auto axis_index_ = conf.axis_index_;
const auto top_k_ = conf.top_k_;
int axis_ = (axis_index_ < 0) ? axis_index_ + static_cast<int>(in_dims.size()) : axis_index_;
const int dim = static_cast<int>(in_dims[axis_]);
int before_num = count(in_dims, 0, axis_);
int after_num = count(in_dims, axis_ + 1, in_dims.size());
int first_index = 0;
#if defined(HAVE_AVX512F)
const int block_size = 16;
typedef __m512 vec_type_f;
typedef __m512i vec_type_i;
typedef __mmask16 vmask_type;
#elif defined(HAVE_AVX2)
const int block_size = 8;
typedef __m256 vec_type_f;
typedef __m256i vec_type_i;
typedef __m256 vmask_type;
#elif defined(HAVE_SSE)
const int block_size = 4;
typedef __m128 vec_type_f;
typedef __m128i vec_type_i;
typedef __m128 vmask_type;
#endif
#if defined(HAVE_SSE) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
if (top_k_ < count_vec) {
parallel_for2d(before_num, after_num / block_size, [&](int i0, int ib1) {
#if defined(HAVE_AVX512F)
const int N = 32;
vec_type_f vmax_values[N];
vec_type_i vmax_indexes[N];
#else
const int N = 16;
vec_type_f vmax_values[N];
vec_type_i vmax_indexes[N];
#endif
vec_type_f vtmp;
vec_type_i vtmp_indexes;
vmask_type vmask;
int s_index = i0 * dim * after_num + ib1 * block_size;
auto vswap_func = [&](int index1, int index2) {
vtmp = vmax_values[index1];
vmax_values[index1] = _mm_uni_blendv_ps(vmax_values[index1], vmax_values[index2], vmask);
vmax_values[index2] = _mm_uni_blendv_ps(vmax_values[index2], vtmp, vmask);
if (!out_max_val) {
vtmp_indexes = vmax_indexes[index1];
#if defined(HAVE_AVX512F)
vmax_indexes[index1] = _mm512_mask_blend_epi32(vmask, vmax_indexes[index1], vmax_indexes[index2]);
vmax_indexes[index2] = _mm512_mask_blend_epi32(vmask, vmax_indexes[index2], vtmp_indexes);
#else
vmax_indexes[index1] = _mm_uni_blendv_epi8(vmax_indexes[index1], vmax_indexes[index2], _mm_uni_castps_si(vmask));
vmax_indexes[index2] = _mm_uni_blendv_epi8(vmax_indexes[index2], vtmp_indexes, _mm_uni_castps_si(vmask));
#endif
}
};
for (int i2 = 0; i2 < top_k_; i2++) {
vmax_values[i2] = _mm_uni_loadu_ps(src_data + s_index);
if (!out_max_val) {
vmax_indexes[i2] = _mm_uni_set1_epi32(i2);
}
s_index += after_num;
}
for (int i2 = 0; i2 < top_k_ - 1; i2++) {
for (int i3 = top_k_ - 1; i3 > i2; i3--) {
vmask = _mm_uni_cmpgt_ps(vmax_values[i3], vmax_values[i3 - 1]);
#if defined(HAVE_AVX512F)
if (vmask) {
vswap_func(i3, i3 - 1);
}
#else
int swap = _mm_uni_movemask_ps(vmask);
if (swap) {
vswap_func(i3, i3 - 1);
}
#endif
}
}
for (int i2 = top_k_; i2 < dim; i2++) {
vmax_values[top_k_] = _mm_uni_loadu_ps(src_data + s_index);
if (!out_max_val) {
vmax_indexes[top_k_] = _mm_uni_set1_epi32(i2);
}
for (int i3 = top_k_; i3 > 0; i3--) {
vmask = _mm_uni_cmpgt_ps(vmax_values[i3], vmax_values[i3 - 1]);
#if defined(HAVE_AVX512F)
if (vmask) {
vswap_func(i3, i3 - 1);
} else {
break;
}
#else
int swap = _mm_uni_movemask_ps(vmask);
if (swap) {
vswap_func(i3, i3 - 1);
} else {
break;
}
#endif
}
s_index += after_num;
}
for (int i2 = 0; i2 < top_k_; i2++) {
if (!out_max_val) {
_mm_uni_storeu_ps(dst_data + (i0 * top_k_ + i2) * after_num + ib1 * block_size,
_mm_uni_cvtepi32_ps(vmax_indexes[i2]));
} else {
_mm_uni_storeu_ps(dst_data + (i0 * top_k_ + i2) * after_num + ib1 * block_size, vmax_values[i2]);
}
}
});
first_index = after_num / block_size * block_size;
}
#endif
int rest = after_num - first_index;
parallel_for2d(before_num, rest, [&](int i0, int i1) {
std::vector<float> max_values(top_k_ + 1);
std::vector<int> max_indexes(top_k_ + 1);
float tmp_value;
int tmp_index;
int s_index = i0 * dim * after_num + first_index + i1;
auto swap_func = [&](int index1, int index2) {
tmp_value = max_values[index1];
max_values[index1] = max_values[index2];
max_values[index2] = tmp_value;
if (!out_max_val) {
tmp_index = max_indexes[index1];
max_indexes[index1] = max_indexes[index2];
max_indexes[index2] = tmp_index;
}
};
for (int i2 = 0; i2 < top_k_; i2++) {
max_values[i2] = src_data[s_index];
if (!out_max_val) {
max_indexes[i2] = i2;
}
s_index += after_num;
}
for (int i2 = 0; i2 < top_k_ - 1; i2++) {
for (int i3 = top_k_ - 1; i3 > i2; i3--) {
if (max_values[i3] > max_values[i3 - 1]) {
swap_func(i3, i3 - 1);
}
}
}
for (int i2 = top_k_; i2 < dim; i2++) {
max_values[top_k_] = src_data[s_index];
if (!out_max_val) {
max_indexes[top_k_] = i2;
}
for (int i3 = top_k_; i3 > 0; i3--) {
if (max_values[i3] > max_values[i3 - 1]) {
swap_func(i3, i3 - 1);
} else {
break;
}
}
s_index += after_num;
}
for (int i2 = 0; i2 < top_k_; i2++) {
if (!out_max_val) {
dst_data[i0 * top_k_ * after_num + i2 * after_num + first_index + i1] = static_cast<float>(max_indexes[i2]);
} else {
dst_data[i0 * top_k_ * after_num + i2 * after_num + first_index + i1] = max_values[i2];
}
}
});
}
template <bool out_max_val>
void argmax_many_classes(const float* src_data, float* dst_data, Shape in_dims, argmax_conf& conf) {
const int dim = count(in_dims, 1);
auto top_k_ = conf.top_k_;
int before_num = in_dims[0];
parallel_for(before_num, [&](int i0) {
std::vector<float> max_values(top_k_ + 1);
std::vector<int> max_indexes(top_k_ + 1);
float tmp_value;
int tmp_index;
int s_index = i0 * dim;
auto swap_func = [&](int index1, int index2) {
tmp_value = max_values[index1];
max_values[index1] = max_values[index2];
max_values[index2] = tmp_value;
tmp_index = max_indexes[index1];
max_indexes[index1] = max_indexes[index2];
max_indexes[index2] = tmp_index;
};
for (int i2 = 0; i2 < top_k_; i2++) {
max_values[i2] = src_data[s_index];
max_indexes[i2] = i2;
s_index++;
}
for (int i2 = 0; i2 < top_k_ - 1; i2++) {
for (int i3 = top_k_ - 1; i3 > i2; i3--) {
if (max_values[i3] > max_values[i3 - 1]) {
swap_func(i3, i3 - 1);
}
}
}
for (int i2 = top_k_; i2 < dim; i2++) {
max_values[top_k_] = src_data[s_index];
max_indexes[top_k_] = i2;
for (int i3 = top_k_; i3 > 0; i3--) {
if (max_values[i3] > max_values[i3 - 1]) {
swap_func(i3, i3 - 1);
} else {
break;
}
}
s_index++;
}
for (int i2 = 0; i2 < top_k_; i2++) {
if (!out_max_val) {
dst_data[i0 * top_k_ + i2] = static_cast<float>(max_indexes[i2]);
} else {
dst_data[i0 * 2 * top_k_ + i2] = static_cast<float>(max_indexes[i2]);
dst_data[i0 * 2 * top_k_ + top_k_ + i2] = max_values[i2];
}
}
});
}
void arg_max_execute(const float* input, float *output, std::vector<size_t> dims, argmax_conf& conf) {
Shape in_dims = dims;
const float* src_data = input;
float* dst_data = output;
auto top_k_ = conf.top_k_;
auto has_axis_ = conf.has_axis_;
auto out_max_val_ = conf.out_max_val_;
if (top_k_ == 1) {
if (has_axis_) {
if (out_max_val_) {
argmax_one_class_has_axis<true>(src_data, dst_data, in_dims, conf);
} else {
argmax_one_class_has_axis<false>(src_data, dst_data, in_dims, conf);
}
} else {
if (out_max_val_) {
argmax_one_class<true>(src_data, dst_data, in_dims);
} else {
argmax_one_class<false>(src_data, dst_data, in_dims);
}
}
} else {
if (has_axis_) {
if (out_max_val_) {
argmax_many_classes_has_axis<true>(src_data, dst_data, in_dims, conf);
} else {
argmax_many_classes_has_axis<false>(src_data, dst_data, in_dims, conf);
}
} else {
if (out_max_val_) {
argmax_many_classes<true>(src_data, dst_data, in_dims, conf);
} else {
argmax_many_classes<false>(src_data, dst_data, in_dims, conf);
}
}
}
}
} // namespace XARCH
} // namespace Cpu
} // namespace Extensions
} // namespace InferenceEngine

View File

@ -1,27 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <cstddef>
#include <vector>
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
struct argmax_conf {
bool out_max_val_;
int top_k_;
bool has_axis_;
int axis_index_;
};
namespace XARCH {
void arg_max_execute(const float* inputs, float *outputs, std::vector<size_t> dims, argmax_conf& conf);
} // namespace XARCH
} // namespace Cpu
} // namespace Extensions
} // namespace InferenceEngine

View File

@ -5,8 +5,11 @@
#pragma once
#include <ie_iextension.h>
#include <legacy/ie_util_internal.hpp>
#include "nodes/list.hpp"
#include "common/tensor_desc_creator.h"
#include "ngraph/descriptor/tensor.hpp"
#include <ie_ngraph_utils.hpp>
#include "cpu_types.h"
#include <string>
#include <vector>
@ -53,99 +56,76 @@ public:
}
protected:
enum class ConfLayout { ANY, PLN, BLK8, BLK16 };
MKLDNNPlugin::Algorithm getAlgorithm() const {
return algorithm;
}
MKLDNNPlugin::Algorithm algorithm;
class DataConfigurator {
public:
explicit DataConfigurator(ConfLayout l):
layout(l) {}
DataConfigurator(MKLDNNPlugin::TensorDescCreatorTypes tensorDescType, Precision prc = Precision::UNSPECIFIED, bool constant = false, int inplace = -1) :
tensorDescCreator(getTensorDescCreator(tensorDescType)), prc(prc), constant(constant), inplace(inplace) {}
DataConfigurator(ConfLayout l, bool constant, int inplace = -1, Precision::ePrecision prc = Precision::UNSPECIFIED):
layout(l), constant(constant), inplace(inplace), prc(prc) {}
DataConfigurator(const MKLDNNPlugin::TensorDescCreator::CreatorConstPtr& tensorDescCreator, Precision prc = Precision::UNSPECIFIED,
bool constant = false, int inplace = -1) : tensorDescCreator(tensorDescCreator), prc(prc), constant(constant), inplace(inplace) {}
DataConfigurator(ConfLayout l, Precision::ePrecision prc):
layout(l), prc(prc) {}
ConfLayout layout;
bool constant = false;
int inplace = -1;
Precision::ePrecision prc = Precision::UNSPECIFIED; // by default use the layer precision
const MKLDNNPlugin::TensorDescCreator::CreatorConstPtr tensorDescCreator;
const bool constant = false;
const int inplace = -1;
const Precision prc = Precision::UNSPECIFIED; // By default ngraph node precision is used
private:
static MKLDNNPlugin::TensorDescCreator::CreatorConstPtr getTensorDescCreator(MKLDNNPlugin::TensorDescCreatorTypes tensorDescType) {
auto& creators = MKLDNNPlugin::TensorDescCreator::getCommonCreators();
if (creators.find(tensorDescType) == creators.end()) {
IE_THROW() << "Cannot find tensor descriptor creator";
}
return creators.at(tensorDescType);
}
};
void addConfig(const CNNLayer* layer, std::vector<DataConfigurator> in_l,
std::vector<DataConfigurator> out_l, bool dynBatchSupport = false) {
void addConfig(const std::shared_ptr<ngraph::Node>& op,
const std::vector<DataConfigurator>& inDataConfigurators,
const std::vector<DataConfigurator>& outDataConfigurators,
bool dynBatchSupport = false) {
LayerConfig config;
if (in_l.size() != layer->insData.size())
IE_THROW() << "Incorrect number of input edges for layer " << layer->name << ". Expected " << layer->insData.size()
<< " but layout specification provided for " << in_l.size();
if (out_l.size() != layer->outData.size())
IE_THROW() << "Incorrect number of output edges for layer " << layer->name << ". Expected " << layer->outData.size()
<< " but layout specification provided for " << out_l.size();
if (inDataConfigurators.size() != op->get_input_size())
IE_THROW() << "Cannot add config for operation " << op->get_friendly_name() << ". Incorrect number of inputs: " <<
"expected: " << op->get_input_size() << ", provided: " << inDataConfigurators.size();
if (outDataConfigurators.size() != op->get_output_size())
IE_THROW() << "Cannot add config for operation " << op->get_friendly_name() << ". Incorrect number of outputs: " <<
"expected: " << op->get_output_size() << ", provided: " << outDataConfigurators.size();
// Fill tensor parameters into config
auto fill_port = [] (std::vector<DataConfig>& port, DataConfigurator conf, const DataPtr& data) {
auto div_up = [](const int a, const int b) -> int {
if (!b)
return 0;
return (a + b - 1) / b;
};
if (!data) IE_THROW() << "Cannot get input data!";
auto fill_port = [] (const DataConfigurator& dataConfigurator, const ngraph::descriptor::Tensor& tensor, std::vector<DataConfig>& port) -> bool {
// In order to simplify particular node initialization logic we simply don't add a config if the target shape is not supported by the tensorDescCreator.
// This should be suitable for the majority of scenarios since almost all nodes add an `ncsp` tensorDescCreator, which supports any shape rank.
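// Example (hypothetical): if a creator's minimal supported rank exceeded the rank of a node's 2D tensor,
// the whole config would simply be skipped instead of throwing, while the node's `ncsp` config (added by
// a separate addConfig call) would still be registered.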
if (tensor.get_shape().size() < dataConfigurator.tensorDescCreator->getMinimalRank())
return false;
auto precision = dataConfigurator.prc != Precision::UNSPECIFIED ? dataConfigurator.prc : details::convertPrecision(tensor.get_element_type());
DataConfig dataConfig;
dataConfig.inPlace = conf.inplace;
dataConfig.constant = conf.constant;
dataConfig.inPlace = dataConfigurator.inplace;
dataConfig.constant = dataConfigurator.constant;
dataConfig.desc = dataConfigurator.tensorDescCreator->createDesc(precision, tensor.get_shape());
const TensorDesc& data_desc = data->getTensorDesc();
const SizeVector& data_dims = data_desc.getDims();
std::vector<size_t> blocks = data_dims;
std::vector<size_t> order(blocks.size());
for (size_t i = 0; i < order.size(); i++) order[i] = i;
const bool isInt8 = (data->getPrecision() == Precision::I8 || data->getPrecision() == Precision::U8);
if (conf.layout == ConfLayout::BLK8 || conf.layout == ConfLayout::BLK16) {
if (data_dims.size() < 4 || data_dims.size() > 5)
IE_THROW() << "Inapplicable blocking layout."
<< "Tensor should be 4D or 5D.";
int blk_size = conf.layout == ConfLayout::BLK8 ? 8 : 16;
// Blocking through Channel dimension. Like [nChwXc]
order.push_back(1);
blocks[1] = div_up(blocks[1], blk_size);
blocks.push_back(blk_size);
} else if (isInt8) {
if (data_dims.size() == 4) {
order = {0, 2, 3, 1};
blocks = {data_dims[0], data_dims[2], data_dims[3], data_dims[1]};
} else if (data_dims.size() == 5) {
order = {0, 2, 3, 4, 1};
blocks = {data_dims[0], data_dims[2], data_dims[3], data_dims[4], data_dims[1]};
} // otherwise keep the original plain format
conf.layout = ConfLayout::PLN;
}
InferenceEngine::Precision precision = (conf.prc == Precision::UNSPECIFIED) ? data_desc.getPrecision() : Precision(conf.prc);
if (conf.layout == ConfLayout::ANY) {
dataConfig.desc = TensorDesc(precision, data_dims, InferenceEngine::Layout::ANY);
} else {
dataConfig.desc = TensorDesc(precision, data_dims, {blocks, order});
}
port.push_back(dataConfig);
return true;
};
for (size_t i = 0; i < in_l.size(); i++)
fill_port(config.inConfs, in_l[i], layer->insData[i].lock());
for (size_t i = 0; i < inDataConfigurators.size(); i++)
if (!fill_port(inDataConfigurators[i], op->get_input_tensor(i), config.inConfs))
return;
for (size_t i = 0; i < out_l.size(); i++)
fill_port(config.outConfs, out_l[i], layer->outData[i]);
for (size_t i = 0; i < outDataConfigurators.size(); i++)
if (!fill_port(outDataConfigurators[i], op->get_output_tensor(i), config.outConfs))
return;
config.dynBatchSupport = dynBatchSupport;
confs.push_back(config);
}
std::string errorMsg;
std::vector<LayerConfig> confs;
};
@ -153,20 +133,22 @@ protected:
template <class IMPL>
class ImplFactory : public ILayerImplFactory {
public:
explicit ImplFactory(const CNNLayer *layer) {
cnnLayer = InferenceEngine::clonelayer(*layer);
cnnLayer->_fusedWith = layer->_fusedWith;
cnnLayer->insData = layer->insData;
cnnLayer->outData = layer->outData;
}
explicit ImplFactory(const std::shared_ptr<ngraph::Node>& op) : ngraphOp(op) {}
// First implementation has more priority than next
StatusCode getImplementations(std::vector<ILayerImpl::Ptr>& impls, ResponseDesc *resp) noexcept override {
impls.push_back(ILayerImpl::Ptr(new IMPL(cnnLayer.get())));
try {
impls.push_back(ILayerImpl::Ptr(new IMPL(ngraphOp)));
} catch (const InferenceEngine::Exception& ex) {
strncpy(resp->msg, ex.what(), sizeof(resp->msg) - 1);
IE_SUPPRESS_DEPRECATED_START
return ex.getStatus() != OK ? ex.getStatus() : GENERAL_ERROR;
IE_SUPPRESS_DEPRECATED_END
}
return OK;
}
protected:
InferenceEngine::CNNLayerPtr cnnLayer;
const std::shared_ptr<ngraph::Node> ngraphOp;
};
#define REG_FACTORY_FOR(__prim, __type) \

View File

@ -1,244 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "base.hpp"
#include "ie_parallel.hpp"
#include <cmath>
#include <string>
#include <vector>
#include <set>
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class BatchToSpaceImpl: public ExtLayerBase {
public:
explicit BatchToSpaceImpl(const CNNLayer *layer) {
try {
const auto batchToSpaceLayer = dynamic_cast<const BatchToSpaceLayer*>(layer);
if (!batchToSpaceLayer)
IE_THROW() << "BatchToSpace layer with name '" << layer->name << "' isn't instance of BatchToSpaceLayer class";
if (batchToSpaceLayer->insData.size() != 4)
IE_THROW() << "BatchToSpace layer with name '" << batchToSpaceLayer->name << "' has incorrect number of input edges";
if (batchToSpaceLayer->outData.size() != 1)
IE_THROW() << "BatchToSpace layer with name '" << batchToSpaceLayer->name << "' has incorrect number of output edges";
auto data = batchToSpaceLayer->insData[0].lock();
if (!data)
IE_THROW() << "BatchToSpace layer with name '" << batchToSpaceLayer->name << "' has nullable input data";
inDims = data->getTensorDesc().getDims();
if (inDims.size() < 4)
IE_THROW() << "BatchToSpace layer with name '" << batchToSpaceLayer->name << "' doesn't support dimensions with rank less than 4";
if (inDims.size() > 5)
IE_THROW() << "BatchToSpace layer with name '" << batchToSpaceLayer->name << "' doesn't support dimensions with rank greater than 5";
outDims = batchToSpaceLayer->outData[0]->getTensorDesc().getDims();
if (inDims.size() != outDims.size())
IE_THROW() << "BatchToSpace layer with name '" << batchToSpaceLayer->name << "' has incorrect number of input/output dimensions";
const auto precision = data->getTensorDesc().getPrecision();
const std::set<size_t> supported_precision_sizes = {1, 2, 4, 8};
if (supported_precision_sizes.find(precision.size()) == supported_precision_sizes.end())
IE_THROW() << "BatchToSpace layer with name '" << batchToSpaceLayer->name << "' has unsupported precision: " << precision.name();
blockShapeIn = batchToSpaceLayer->_block_shape;
cropsBeginIn = batchToSpaceLayer->_crops_begin;
auto createConfig = [&](Layout layout) {
LayerConfig config;
// TODO: remove Const layers
for (int i = 0; i < batchToSpaceLayer->insData.size(); i++) {
auto inData = batchToSpaceLayer->insData[i].lock();
if (!inData)
IE_THROW() << "BatchToSpace layer with name '" << batchToSpaceLayer->name << "' has nullable input data";
DataConfig inConfig;
if (i == 0)
inConfig.desc = TensorDesc(precision, inData->getTensorDesc().getDims(), layout);
else
inConfig.desc = TensorDesc(inData->getPrecision(), inData->getTensorDesc().getDims(), inData->getTensorDesc().getLayout());
config.inConfs.push_back(inConfig);
}
DataConfig outConfig;
outConfig.desc = TensorDesc(precision, outDims, layout);
config.outConfs.push_back(outConfig);
config.dynBatchSupport = false;
confs.push_back(config);
};
createConfig(inDims.size() == 4 ? NHWC : NDHWC);
createConfig(TensorDesc::getLayoutByDims(inDims));
std::vector<std::pair<ConfLayout, ConfLayout>> blockConfs { };
if (inDims[1] % 8 == 0) blockConfs.push_back({ConfLayout::BLK8, ConfLayout::BLK8});
if (inDims[1] % 16 == 0) blockConfs.push_back({ConfLayout::BLK16, ConfLayout::BLK16});
for (auto conf : blockConfs) {
addConfig(layer, {DataConfigurator(conf.first, precision),
DataConfigurator(ConfLayout::PLN, batchToSpaceLayer->insData[1].lock()->getPrecision()),
DataConfigurator(ConfLayout::PLN, batchToSpaceLayer->insData[2].lock()->getPrecision()),
DataConfigurator(ConfLayout::PLN, batchToSpaceLayer->insData[3].lock()->getPrecision())},
{DataConfigurator(conf.second, precision)});
}
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
}
}
StatusCode execute(std::vector<Blob::Ptr> &inputs, std::vector<Blob::Ptr> &outputs, ResponseDesc *resp) noexcept override {
switch (inputs[0]->getTensorDesc().getPrecision().size()) {
case 1: batchToSpaceKernel<PrecisionTrait<Precision::U8>::value_type> (inputs, outputs); break;
case 2: batchToSpaceKernel<PrecisionTrait<Precision::U16>::value_type>(inputs, outputs); break;
case 4: batchToSpaceKernel<PrecisionTrait<Precision::I32>::value_type>(inputs, outputs); break;
default: {
if (resp) {
std::string errorMsg = "BatchToSpace layer does not support precision '"
+ std::string(inputs[0]->getTensorDesc().getPrecision().name()) + "'";
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
return GENERAL_ERROR;
}
}
}
return OK;
}
private:
std::vector<size_t> getShape5D(const SizeVector &shape) {
std::vector<size_t> shape5D(5, 1);
for (int i = 0; i < 2; i++) {
shape5D[i] = shape[i];
shape5D[4 - i] = shape[shape.size() - 1 - i];
}
shape5D[2] = shape.size() == 5 ? shape[2] : shape5D[2];
return shape5D;
}
template<typename T>
void batchToSpaceKernel(std::vector<Blob::Ptr> &inputs, std::vector<Blob::Ptr> &outputs) noexcept {
const T *srcData = inputs[0]->cbuffer().as<const T *>() + inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
T *dstData = outputs[0]->buffer().as<T *>() + outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
const auto layout = inputs[0]->getTensorDesc().getLayout();
const bool blocked = layout != NCHW && layout != NCDHW && layout != NHWC && layout != NDHWC;
const auto dimsSize = inDims.size();
auto inShape5D = getShape5D(inDims);
auto outShape5D = getShape5D(outDims);
auto blockShape = getShape5D(blockShapeIn);
if (layout == NHWC || layout == NDHWC) {
inShape5D.push_back(inShape5D[1]);
inShape5D.erase(inShape5D.begin() + 1);
outShape5D.push_back(outShape5D[1]);
outShape5D.erase(outShape5D.begin() + 1);
blockShape.push_back(blockShape[1]);
blockShape.erase(blockShape.begin() + 1);
}
const size_t blockSize = blocked ? outputs[0]->getTensorDesc().getBlockingDesc().getBlockDims().back() : 1lu;
const size_t blockCountInput = inputs[0]->getTensorDesc().getBlockingDesc().getBlockDims()[1];
const size_t blockCountOutput = outputs[0]->getTensorDesc().getBlockingDesc().getBlockDims()[1];
const auto blockRemainder = inShape5D[1] % blockSize;
const auto lastBlock = blockRemainder == 0 ? blockSize : blockRemainder;
const size_t inSpatialStep = inShape5D[2] * inShape5D[3] * inShape5D[4];
const size_t inBatchStep = (blocked ? blockSize * blockCountInput : inShape5D[1]) * inSpatialStep;
const size_t outSpatialStep = outShape5D[2] * outShape5D[3] * outShape5D[4];
const size_t outBatchStep = (blocked ? blockSize * blockCountOutput : outShape5D[1]) * outSpatialStep;
size_t channels = (inShape5D[1] / blockSize);
channels = channels == 0 ? 1 : channels;
const size_t workAmount = inShape5D[0] * channels;
parallel_nt(0, [&](const int ithr, const int nthr) {
size_t start(0lu), end(0lu);
splitter(workAmount, nthr, ithr, start, end);
std::vector<size_t> indxStart(2, 0);
std::vector<size_t> indxEnd(2, 0);
parallel_it_init(start, indxStart[0], inShape5D[0], indxStart[1], channels);
parallel_it_init((end - 1), indxEnd[0], inShape5D[0], indxEnd[1], channels);
std::vector<int64_t> oAdd(5, 1);
std::vector<size_t> begin(5, 0);
std::vector<size_t> finish(5, 1);
for (size_t i0 = indxStart[0]; i0 < indxEnd[0] + 1; ++i0) {
int64_t bIdx = i0 / outShape5D[0];
const size_t srcIdx0 = i0 * inBatchStep;
const size_t dstIdx0 = (i0 - (bIdx * outShape5D[0])) * outBatchStep;
oAdd[4] = bIdx % blockShapeIn[dimsSize - 1] - cropsBeginIn[dimsSize - 1];
bIdx /= blockShapeIn[dimsSize - 1];
oAdd[3] = bIdx % blockShapeIn[dimsSize - 2] - cropsBeginIn[dimsSize - 2];
bIdx /= blockShapeIn[dimsSize - 2];
oAdd[2] = dimsSize == 5 ? bIdx % blockShapeIn[2] - cropsBeginIn[2] : 0lu;
bIdx = dimsSize == 5 ? bIdx / blockShapeIn[2] : bIdx;
oAdd[1] = bIdx % blockShapeIn[1] - cropsBeginIn[1];
if (layout == NHWC || layout == NDHWC) {
oAdd.push_back(oAdd[1]);
oAdd.erase(oAdd.begin() + 1);
}
begin[1] = (blockShape[1] - 1 - oAdd[1]) / blockShape[1] / blockSize;
finish[1] = (outShape5D[1] - 1 - oAdd[1]) / blockShape[1] / blockSize;
begin[2] = (blockShape[2] - 1 - oAdd[2]) / blockShape[2];
finish[2] = (outShape5D[2] - 1 - oAdd[2]) / blockShape[2];
begin[3] = (blockShape[3] - 1 - oAdd[3]) / blockShape[3];
finish[3] = (outShape5D[3] - 1 - oAdd[3]) / blockShape[3];
begin[4] = (blockShape[4] - 1 - oAdd[4]) / blockShape[4];
finish[4] = (outShape5D[4] - 1 - oAdd[4]) / blockShape[4];
const int64_t addTmpOC = blocked ? 0lu : oAdd[1];
const int64_t addTmpOc = blocked ? oAdd[1] : 0lu;
indxStart[1] = begin[1] > indxStart[1] ? begin[1] : indxStart[1];
const size_t lastI1 = i0 == indxEnd[0] ? (indxEnd[1] > finish[1] ? finish[1] : indxEnd[1]) : finish[1];
for (; indxStart[1] < lastI1 + 1; ++indxStart[1]) {
const size_t block = indxStart[1] == finish[1] ? lastBlock : blockSize;
const int64_t tmpOC = indxStart[1] * blockShape[1] + addTmpOC;
const size_t srcIdx1 = srcIdx0 + indxStart[1] * inSpatialStep * blockSize;
const size_t dstIdx1 = dstIdx0 + tmpOC * outSpatialStep * blockSize;
const size_t itEnd = blocked ? ((block - 1) * blockShape[1] + oAdd[1]) / blockSize : 0lu;
for (size_t i2 = begin[2]; i2 < finish[2] + 1; ++i2) {
const int64_t tmpOd = i2 * blockShape[2] + oAdd[2];
const size_t srcIdx2 = srcIdx1 + i2 * inShape5D[3] * inShape5D[4] * blockSize;
const size_t dstIdx2 = dstIdx1 + tmpOd * outShape5D[3] * outShape5D[4] * blockSize;
for (size_t i3 = begin[3]; i3 < finish[3] + 1; ++i3) {
const int64_t tmpOh = i3 * blockShape[3] + oAdd[3];
const size_t srcIdx3 = srcIdx2 + i3 * inShape5D[4] * blockSize;
const size_t dstIdx3 = dstIdx2 + tmpOh * outShape5D[4] * blockSize;
for (size_t i4 = begin[4]; i4 < finish[4] + 1; ++i4) {
const int64_t tmpOw = i4 * blockShape[4] + oAdd[4];
const size_t srcIdx4 = srcIdx3 + i4 * blockSize;
const size_t dstIdx4 = dstIdx3 + tmpOw * blockSize;
for (size_t it = 0; it < itEnd + 1; ++it) {
const size_t i5Begin = it == 0 ? 0 : (it * blockSize - 1 - oAdd[1]) / blockShape[1] + 1;
const size_t i5End = it == itEnd ? (block - 1) : ((it + 1) * blockSize - 1 - oAdd[1]) / blockShape[1];
for (size_t i5 = i5Begin; i5 < i5End + 1; ++i5) {
const int64_t tmpOc = i5 * blockShape[1] + addTmpOc;
const size_t srcIdx5 = srcIdx4 + i5;
const size_t dstIdx5 = dstIdx4 + it * outSpatialStep * blockSize + (tmpOc - it * blockSize);
dstData[dstIdx5] = srcData[srcIdx5];
}
}
}
}
}
}
indxStart[1] = 0lu;
}
});
}
SizeVector inDims;
SizeVector outDims;
std::vector<size_t> blockShapeIn;
std::vector<size_t> cropsBeginIn;
};
REG_FACTORY_FOR(BatchToSpaceImpl, BatchToSpace);
} // namespace Cpu
} // namespace Extensions
} // namespace InferenceEngine

View File

@ -1,135 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "base.hpp"
#include <cmath>
#include <string>
#include <vector>
#include <cassert>
#include "ie_parallel.hpp"
#include "common/cpu_memcpy.h"
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class BroadcastImpl: public ExtLayerBase {
public:
explicit BroadcastImpl(const CNNLayer* layer) {
try {
if (layer->insData.empty() || layer->outData.empty())
IE_THROW() << layer->name << " Incorrect number of input/output edges!";
if (layer->insData.size() != 2)
IE_THROW() << layer->name << " Incorrect number of input edges!";
SizeVector shape_dims = layer->insData[BROADCAST_SHAPE].lock()->getTensorDesc().getDims();
if (shape_dims.size() > 1)
IE_THROW() << layer->name << " Shape vector should be 1 dimension";
LayerConfig config;
DataConfig dataConfig, shapeConfig;
Precision dataPrecision = layer->insData[BROADCAST_INPUT].lock()->getTensorDesc().getPrecision();
const SizeVector& data_dims = layer->insData[BROADCAST_INPUT].lock()->getTensorDesc().getDims();
dataConfig.desc = TensorDesc(dataPrecision, data_dims,
layer->insData[BROADCAST_INPUT].lock()->getTensorDesc().getLayout());
config.inConfs.push_back(dataConfig);
shapeConfig.desc = TensorDesc(layer->insData[BROADCAST_SHAPE].lock()->getTensorDesc().getPrecision(),
shape_dims, TensorDesc::getLayoutByDims(shape_dims));
config.inConfs.push_back(shapeConfig);
DataConfig outConfig;
const SizeVector& out_dims = layer->outData[0]->getTensorDesc().getDims();
outConfig.desc = TensorDesc(dataPrecision, out_dims, layer->outData[0]->getTensorDesc().getLayout());
config.outConfs.push_back(outConfig);
config.dynBatchSupport = false;
confs.push_back(config);
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
}
}
StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept override {
size_t shape_size = (inputs[BROADCAST_SHAPE]->getTensorDesc().getDims())[0];
SizeVector dst_dims = outputs[0]->getTensorDesc().getDims();
SizeVector src_dims = inputs[BROADCAST_INPUT]->getTensorDesc().getDims();
SizeVector srcStrides = inputs[BROADCAST_INPUT]->getTensorDesc().getBlockingDesc().getStrides();
size_t data_size = inputs[BROADCAST_INPUT]->getTensorDesc().getPrecision().size();
if (!src_dims.size())
src_dims = SizeVector(1, 1);
if (!srcStrides.size())
srcStrides = SizeVector(1, 1);
if (dst_dims.size() != shape_size) {
if (resp) {
std::string errorMsg = "Output tensor dimension mismatch";
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
}
return PARAMETER_MISMATCH;
}
if (src_dims.size() > dst_dims.size()) {
if (resp) {
std::string errorMsg = "Output tensor dimension is smaller then input tensor dimension";
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
}
return PARAMETER_MISMATCH;
}
InferenceEngine::SizeVector dstStrides = outputs[0]->getTensorDesc().getBlockingDesc().getStrides();
InferenceEngine::SizeVector src_aligned(dst_dims.size());
InferenceEngine::SizeVector srcStrides_aligned(dst_dims.size());
size_t prefix_size = dst_dims.size() - src_dims.size();
for (size_t i = 0; i < dst_dims.size(); i++) {
if (i < prefix_size) {
src_aligned[i] = 1;
srcStrides_aligned[i] = srcStrides[0];
} else {
src_aligned[i] = src_dims[i - prefix_size];
srcStrides_aligned[i] = srcStrides[i - prefix_size];
}
}
size_t work_amount_dst = dstStrides[0] * dst_dims[0];
const uint8_t *src_data = inputs[BROADCAST_INPUT]->cbuffer().as<const uint8_t *>() +
inputs[BROADCAST_INPUT]->getTensorDesc().getBlockingDesc().getOffsetPadding();
uint8_t* dst_data = outputs[0]->cbuffer().as<uint8_t *>() +
outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
parallel_nt(0, [&](const int ithr, const int nthr) {
size_t i, src_idx, start = 0, end = 0;
SizeVector counters(dst_dims.size(), 0);
splitter(work_amount_dst, nthr, ithr, start, end);
for (int j = dst_dims.size() - 1, i = start; j >= 0; j--) {
counters[j] = i % dst_dims[j];
i /= dst_dims[j];
}
for (size_t iwork = start * data_size; iwork < end * data_size; iwork += data_size) {
for (i = 0, src_idx = 0; i < dst_dims.size(); ++i)
src_idx += counters[i] ? ((counters[i] % src_aligned[i]) * srcStrides_aligned[i]) : 0;
cpu_memcpy(&dst_data[iwork], &src_data[src_idx * data_size], data_size);
for (int j = dst_dims.size() - 1; j >= 0; j--) {
counters[j] = (counters[j] + 1) % dst_dims[j];
if (counters[j] != 0) break;
}
}
});
return OK;
}
private:
const size_t BROADCAST_INPUT = 0;
const size_t BROADCAST_SHAPE = 1;
};
REG_FACTORY_FOR(BroadcastImpl, Broadcast);
} // namespace Cpu
} // namespace Extensions
} // namespace InferenceEngine

View File

@ -12,55 +12,72 @@
#include <algorithm>
#include <limits>
#include "ie_parallel.hpp"
#include <ngraph/opsets/opset3.hpp>
using namespace MKLDNNPlugin;
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class BucketizeImpl : public ExtLayerBase {
public:
explicit BucketizeImpl(const CNNLayer* layer) {
bool isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
if (layer->insData.size() != 2 || layer->outData.size() != 1) {
IE_THROW() << layer->name << " Incorrect number of input/output edges!";
const auto bucketsize = std::dynamic_pointer_cast<const ngraph::opset3::Bucketize>(op);
if (!bucketsize) {
errorMessage = "Only opset3 Bucketize operation is supported";
return false;
}
} catch (...) {
return false;
}
return true;
}
std::string errorPrefix;
public:
explicit BucketizeImpl(const std::shared_ptr<ngraph::Node>& op) {
try {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
errorPrefix = "Bucketize layer with name '" + op->get_friendly_name() + "' ";
const auto bucketsize = std::dynamic_pointer_cast<const ngraph::opset3::Bucketize>(op);
if (op->get_input_size() != 2 || op->get_output_size() != 1) {
IE_THROW() << errorPrefix << " has incorrect number of input/output edges!";
}
// check one attribute
with_right = layer->GetParamAsBool("with_right_bound");
auto input = layer->insData[INPUT_TENSOR_PORT].lock();
if (!input) {
IE_THROW() << "Missing input for " << layer->name << " layer";
}
auto boundaries = layer->insData[INPUT_BINS_PORT].lock();
if (!boundaries) {
IE_THROW() << "Missing boundaries input for " << layer->name << " layer";
}
with_right = bucketsize->get_with_right_bound();
// check precisions for input and output tensors
input_precision = input->getTensorDesc().getPrecision();
input_precision = details::convertPrecision(op->get_input_element_type(INPUT_TENSOR_PORT));
if (input_precision != Precision::FP32 && input_precision != Precision::I32 &&
input_precision != Precision::I64) {
input_precision = Precision::FP32;
}
boundaries_precision = boundaries->getTensorDesc().getPrecision();
boundaries_precision = details::convertPrecision(op->get_input_element_type(INPUT_BINS_PORT));
if (boundaries_precision != Precision::FP32 && boundaries_precision != Precision::I32 &&
boundaries_precision != Precision::I64) {
boundaries_precision = Precision::FP32;
}
output_precision = layer->outData[OUTPUT_TENSOR_PORT]->getTensorDesc().getPrecision();
output_precision = details::convertPrecision(op->get_output_element_type(OUTPUT_TENSOR_PORT));
if (output_precision != Precision::I32 && output_precision != Precision::I64) {
output_precision = Precision::I32;
}
// check dimensions of input tensors
SizeVector input_tensor_dims = input->getTensorDesc().getDims();
SizeVector input_tensor_dims = op->get_input_shape(INPUT_TENSOR_PORT);
if (input_tensor_dims.size() < 1) {
IE_THROW() << layer->name << " Incorrect dimensions of the input.";
IE_THROW() << errorPrefix << " has incorrect dimensions of the input.";
}
SizeVector input_bin_dims = boundaries->getTensorDesc().getDims();
SizeVector input_bin_dims = op->get_input_shape(INPUT_BINS_PORT);
if (input_bin_dims.size() != 1) {
IE_THROW() << layer->name << " Incorrect dimensions of the boundaries tensor.";
IE_THROW() << errorPrefix << " has incorrect dimensions of the boundaries tensor.";
}
if (input_bin_dims[0] != 0) {
with_bins = true;
@ -69,9 +86,9 @@ public:
num_values = std::accumulate(input_tensor_dims.begin(), input_tensor_dims.end(), 1, std::multiplies<size_t>());
addConfig(layer,
{ DataConfigurator(ConfLayout::PLN, input_precision), DataConfigurator(ConfLayout::PLN, boundaries_precision) },
{ DataConfigurator(ConfLayout::PLN, output_precision) });
addConfig(op, {{TensorDescCreatorTypes::ncsp, input_precision},
{TensorDescCreatorTypes::ncsp, boundaries_precision}},
{{TensorDescCreatorTypes::ncsp, output_precision}});
}
catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();

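The Bucketize rewrite above follows the constructor pattern used throughout this migration: validate the ngraph node in isSupportedOperation() first, and report unsupported cases through a NotImplemented exception so the caller can fall back to another implementation. A minimal sketch of that control flow, using hypothetical stand-in types (Node, MyOp, NotImplementedError are illustrative, not the real IE/ngraph classes):

#include <memory>
#include <stdexcept>
#include <string>

struct Node { virtual ~Node() = default; };   // stand-in for ngraph::Node
struct MyOp : Node {};                        // stand-in for the supported opset type

struct NotImplementedError : std::runtime_error {  // stand-in for IE_THROW(NotImplemented)
    using std::runtime_error::runtime_error;
};

class MyLayerImpl {
public:
    static bool isSupportedOperation(const std::shared_ptr<const Node>& op,
                                     std::string& errorMessage) noexcept {
        if (!std::dynamic_pointer_cast<const MyOp>(op)) {
            errorMessage = "Only MyOp is supported";
            return false;
        }
        return true;
    }

    explicit MyLayerImpl(const std::shared_ptr<Node>& op) {
        std::string errorMessage;
        if (!isSupportedOperation(op, errorMessage))
            throw NotImplementedError(errorMessage);  // caller can try a different implementation
        // ...read attributes from the op and register port configurations here...
    }
};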
View File

@ -4,45 +4,68 @@
#include "base.hpp"
#include "ie_parallel.hpp"
#include <ngraph/op/ctc_greedy_decoder.hpp>
#include <nodes/common/tensor_desc_creator.h>
#include <vector>
#include <string>
#include <vector>
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
using MKLDNNPlugin::TensorDescCreatorTypes;
class CTCGreedyDecoderImpl: public ExtLayerBase {
public:
explicit CTCGreedyDecoderImpl(const CNNLayer* layer) : mergeRepeated_(true) {
std::string errPrefix = "CTCGreedyDecoder layer with name '" + layer->name + "' ";
if (layer->insData.size() != 2)
IE_THROW() << errPrefix << "has invalid number of input edges: " << layer->insData.size();
if (layer->outData.size() != 1)
IE_THROW() << errPrefix << "has invalid number of outputs edges: " << layer->outData.size();
static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
auto greedyDecOp = ngraph::as_type_ptr<const ngraph::op::v0::CTCGreedyDecoder>(op);
if (!greedyDecOp) {
errorMessage = "Node is not an instance of the CTCGreedyDecoder operation from operation set v0.";
return false;
}
} catch (...) {
return false;
}
auto inData = layer->insData[DATA_INDEX].lock();
auto sequenceLenData = layer->insData[SEQUENCE_LENGTH_INDEX].lock();
if (!inData || !sequenceLenData)
IE_THROW() << errPrefix << "has nullable inputs.";
if (inData->getTensorDesc().getDims()[0] != sequenceLenData->getTensorDesc().getDims()[0] &&
inData->getTensorDesc().getDims()[1] != sequenceLenData->getTensorDesc().getDims()[1])
IE_THROW() << errPrefix << "has invalid input shapes.";
if (inData->getTensorDesc().getPrecision() != Precision::FP32 &&
inData->getTensorDesc().getPrecision() != Precision::BF16)
IE_THROW() << errPrefix << "has unsupported 'data' input precision: " << inData->getTensorDesc().getPrecision();
if (sequenceLenData->getTensorDesc().getPrecision() != Precision::FP32 &&
inData->getTensorDesc().getPrecision() != Precision::BF16)
IE_THROW() << errPrefix << "has unsupported 'sequence_length' input precision: " << sequenceLenData->getTensorDesc().getPrecision();
return true;
}
std::vector<DataConfigurator> inputConfigs{{ConfLayout::PLN, Precision::FP32}, {ConfLayout::PLN, Precision::FP32}};
std::vector<DataConfigurator> outputConfigs{{ConfLayout::PLN, Precision::FP32}};
addConfig(layer, inputConfigs, outputConfigs);
explicit CTCGreedyDecoderImpl(const std::shared_ptr<ngraph::Node>& op) : mergeRepeated_(true) {
try {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
if (layer->CheckParamPresence("ctc_merge_repeated")) {
mergeRepeated_ = layer->GetParamAsBool("ctc_merge_repeated");
} else if (layer->CheckParamPresence("merge_repeated")) {
mergeRepeated_ = layer->GetParamAsBool("merge_repeated", true);
std::string errPrefix = "CTCGreedyDecoder layer with name '" + op->get_friendly_name() + "' ";
if (op->get_input_size() != 2)
IE_THROW() << errPrefix << "has invalid number of input edges: " << op->get_input_size();
if (op->get_output_size() != 1)
IE_THROW() << errPrefix << "has invalid number of outputs edges: " << op->get_output_size();
if (op->get_input_shape(DATA_INDEX)[0] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[0] &&
op->get_input_shape(DATA_INDEX)[1] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[1])
IE_THROW() << errPrefix << "has invalid input shapes.";
Precision inDataPrecision = details::convertPrecision(op->get_input_element_type(DATA_INDEX));
if (inDataPrecision != Precision::FP32 && inDataPrecision != Precision::BF16)
IE_THROW() << errPrefix << "has unsupported 'data' input precision: " << inDataPrecision;
Precision seqLenPrecision = details::convertPrecision(op->get_input_element_type(SEQUENCE_LENGTH_INDEX));
if (seqLenPrecision != Precision::FP32 && seqLenPrecision != Precision::BF16)
IE_THROW() << errPrefix << "has unsupported 'sequence_length' input precision: " << seqLenPrecision;
auto greedyDecOp = ngraph::as_type_ptr<const ngraph::op::v0::CTCGreedyDecoder>(op);
mergeRepeated_ = greedyDecOp->get_ctc_merge_repeated();
addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32},
{TensorDescCreatorTypes::ncsp, Precision::FP32}},
{{TensorDescCreatorTypes::ncsp, Precision::FP32}});
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
throw;
}
}

View File

@ -4,51 +4,81 @@
#include "base.hpp"
#include "ie_parallel.hpp"
#include <ngraph/op/ctc_greedy_decoder_seq_len.hpp>
#include <nodes/common/tensor_desc_creator.h>
#include <vector>
#include <string>
#include <vector>
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
using MKLDNNPlugin::TensorDescCreatorTypes;
class CTCGreedyDecoderSeqLenImpl: public ExtLayerBase {
public:
explicit CTCGreedyDecoderSeqLenImpl(const CNNLayer* layer) : mergeRepeated_(true) {
errPrefix = "CTCGreedyDecoderSeqLen layer with name '" + layer->name + "' ";
if (layer->insData.size() < 2 || layer->insData.size() > 3)
IE_THROW() << errPrefix << "has invalid number of input edges: " << layer->insData.size();
if (layer->outData.size() != 2)
IE_THROW() << errPrefix << "has invalid number of outputs edges: " << layer->outData.size();
auto inData = layer->insData[DATA_INDEX].lock();
auto sequenceLenData = layer->insData[SEQUENCE_LENGTH_INDEX].lock();
if (!inData || !sequenceLenData)
IE_THROW() << errPrefix << "has nullable inputs.";
if (inData->getTensorDesc().getDims()[0] != sequenceLenData->getTensorDesc().getDims()[0])
IE_THROW() << errPrefix << "has invalid input shapes.";
if (inData->getTensorDesc().getPrecision() != Precision::FP32 &&
inData->getTensorDesc().getPrecision() != Precision::BF16)
IE_THROW() << errPrefix << "has unsupported 'data' input precision: " << inData->getTensorDesc().getPrecision();
if (sequenceLenData->getTensorDesc().getPrecision() != Precision::I32 &&
sequenceLenData->getTensorDesc().getPrecision() != Precision::I64)
IE_THROW() << errPrefix << "has unsupported 'sequence_length' input precision: " << sequenceLenData->getTensorDesc().getPrecision();
std::vector<DataConfigurator> inputConfigs{{ConfLayout::PLN, Precision::FP32}, {ConfLayout::PLN, Precision::I32}};
if (layer->insData.size() > BLANK_INDEX) {
auto blankIndexData = layer->insData[BLANK_INDEX].lock();
if (!blankIndexData)
IE_THROW() << errPrefix << "has nullable inputs.";
if (blankIndexData->getTensorDesc().getPrecision() != Precision::I32 &&
blankIndexData->getTensorDesc().getPrecision() != Precision::I64)
IE_THROW() << errPrefix << "has unsupported 'blank_index' input precision: " << blankIndexData->getTensorDesc().getPrecision();
inputConfigs.push_back({ConfLayout::PLN, Precision::I32});
static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
auto greedyDecOp = ngraph::as_type_ptr<const ngraph::op::v6::CTCGreedyDecoderSeqLen>(op);
if (!greedyDecOp) {
errorMessage = "Node is not an instance of the CTCGreedyDecoderSeqLen operation from operation set v6.";
return false;
}
} catch (...) {
return false;
}
std::vector<DataConfigurator> outputConfigs{{ConfLayout::PLN, Precision::I32}, {ConfLayout::PLN, Precision::I32}};
addConfig(layer, inputConfigs, outputConfigs);
mergeRepeated_ = layer->GetParamAsBool("merge_repeated", true);
return true;
}
explicit CTCGreedyDecoderSeqLenImpl(const std::shared_ptr<ngraph::Node>& op) : mergeRepeated_(true) {
try {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
std::string errPrefix = "CTCGreedyDecoderSeqLen layer with name '" + op->get_friendly_name() + "' ";
if (op->get_input_size() < 2 || op->get_input_size() > 3)
IE_THROW() << errPrefix << "has invalid number of input edges: " << op->get_input_size();
if (op->get_output_size() != 2)
IE_THROW() << errPrefix << "has invalid number of outputs edges: " << op->get_output_size();
if (op->get_input_shape(DATA_INDEX)[0] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[0])
IE_THROW() << errPrefix << "has invalid input shapes.";
Precision inDataPrecision = details::convertPrecision(op->get_input_element_type(DATA_INDEX));
if (inDataPrecision != Precision::FP32 && inDataPrecision != Precision::BF16)
IE_THROW() << errPrefix << "has unsupported 'data' input precision: " << inDataPrecision;
Precision seqLenPrecision = details::convertPrecision(op->get_input_element_type(SEQUENCE_LENGTH_INDEX));
if (seqLenPrecision != Precision::I32 && seqLenPrecision != Precision::I64)
IE_THROW() << errPrefix << "has unsupported 'sequence_length' input precision: " << seqLenPrecision;
auto greedyDecOp = ngraph::as_type_ptr<const ngraph::op::v6::CTCGreedyDecoderSeqLen>(op);
mergeRepeated_ = greedyDecOp->get_merge_repeated();
if (op->get_input_size() == BLANK_INDEX) {
addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32},
{TensorDescCreatorTypes::ncsp, Precision::I32}},
{{TensorDescCreatorTypes::ncsp, Precision::I32},
{TensorDescCreatorTypes::ncsp, Precision::I32}});
} else {
Precision blIdxPrecision = details::convertPrecision(op->get_input_element_type(BLANK_INDEX));
if (blIdxPrecision != Precision::I32 && blIdxPrecision != Precision::I64)
IE_THROW() << errPrefix << "has unsupported 'blank_index' input precision: " << blIdxPrecision;
addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32},
{TensorDescCreatorTypes::ncsp, Precision::I32},
{TensorDescCreatorTypes::ncsp, Precision::I32}},
{{TensorDescCreatorTypes::ncsp, Precision::I32},
{TensorDescCreatorTypes::ncsp, Precision::I32}});
}
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
throw;
}
}
StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs,

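CTCGreedyDecoderSeqLen above shows how an optional input changes the registered port configuration: a third (blank_index) entry is added only when the node actually has three inputs. A small sketch of that conditional construction with simplified stand-in types (Prec and PortConfig are illustrative, not the plugin's DataConfigurator):

#include <cstddef>
#include <vector>

enum class Prec { FP32, I32 };
struct PortConfig { Prec precision; };

// Builds the input-side configuration: logits and sequence_length are always
// present; blank_index is registered only when the node provides it.
std::vector<PortConfig> makeInputConfigs(size_t inputCount) {
    std::vector<PortConfig> in{{Prec::FP32},   // logits
                               {Prec::I32}};   // sequence_length
    if (inputCount > 2)
        in.push_back({Prec::I32});             // optional blank_index
    return in;
}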
View File

@ -4,6 +4,8 @@
#include "base.hpp"
#include "ie_parallel.hpp"
#include <ngraph/op/ctc_loss.hpp>
#include <nodes/common/tensor_desc_creator.h>
#include <cmath>
@ -12,46 +14,52 @@ namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
using MKLDNNPlugin::TensorDescCreatorTypes;
class CTCLossImpl : public ExtLayerBase {
public:
explicit CTCLossImpl(const CNNLayer* layer) {
_logPrefix = std::string("CTCLoss layer with name '") + layer->name + "'";
if (layer->insData.size() != 4 && layer->insData.size() != 5)
IE_THROW() << _logPrefix << " has invalid inputs number.";
_ctcMergeRepeated = layer->GetParamAsBool("ctc_merge_repeated", true);
_preprocessCollapseRepeated = layer->GetParamAsBool("preprocess_collapse_repeated", false);
_unique = layer->GetParamAsBool("unique", false);
auto logitsData = layer->insData[0].lock();
if (logitsData == nullptr)
IE_THROW() << _logPrefix << " has nullable logits data";
LayerConfig config;
config.inConfs.resize(layer->insData.size());
config.inConfs[0].desc = TensorDesc(Precision::FP32,
logitsData->getTensorDesc().getDims(),
TensorDesc::getLayoutByDims(logitsData->getTensorDesc().getDims()));
auto intPrecision = Precision::I32;
for (int i = 1; i < layer->insData.size(); i++) {
auto data = layer->insData[i].lock();
if (data == nullptr)
IE_THROW() << _logPrefix << " has nullable input data at " << i;
config.inConfs[i].desc = TensorDesc(intPrecision,
data->getTensorDesc().getDims(),
TensorDesc::getLayoutByDims(data->getTensorDesc().getDims()));
static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
auto ctcLossOp = ngraph::as_type_ptr<const ngraph::op::v4::CTCLoss>(op);
if (!ctcLossOp) {
errorMessage = "Node is not an instance of the CTCLoss operation from operation set v4.";
return false;
}
} catch (...) {
return false;
}
DataConfig outConfig;
auto& outDims = layer->outData[0]->getTensorDesc().getDims();
outConfig.desc = TensorDesc(Precision::FP32,
outDims,
TensorDesc::getLayoutByDims(outDims));
config.outConfs.push_back(outConfig);
config.dynBatchSupport = false;
return true;
}
confs.push_back(config);
explicit CTCLossImpl(const std::shared_ptr<ngraph::Node>& op) {
try {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
_logPrefix = std::string("CTCLoss layer with name '") + op->get_friendly_name() + "'";
if (op->get_input_size() != 4 && op->get_input_size() != 5)
IE_THROW() << _logPrefix << " has invalid inputs number.";
auto ctcLossOp = ngraph::as_type_ptr<const ngraph::op::v4::CTCLoss>(op);
_ctcMergeRepeated = ctcLossOp->get_ctc_merge_repeated();
_preprocessCollapseRepeated = ctcLossOp->get_preprocess_collapse_repeated();
_unique = ctcLossOp->get_unique();
std::vector<DataConfigurator> inDataConfigurators;
inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, Precision::FP32});
for (int i = 1; i < op->get_input_size(); i++) {
inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, Precision::I32});
}
addConfig(op, inDataConfigurators,
{{TensorDescCreatorTypes::ncsp, Precision::FP32}});
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
throw;
}
}
StatusCode execute(std::vector<Blob::Ptr>& inputs,

View File

@ -9,11 +9,16 @@
#include <vector>
#include "ie_parallel.hpp"
#include "ie_precision.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset3.hpp>
#include <ie_ngraph_utils.hpp>
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
using MKLDNNPlugin::TensorDescCreatorTypes;
class CumSumImpl: public ExtLayerBase {
enum { CUM_SUM_DATA, AXIS, numOfInputs };
bool exclusive;
@ -22,71 +27,67 @@ class CumSumImpl: public ExtLayerBase {
size_t axis = 0;
std::vector<size_t> shape;
public:
explicit CumSumImpl(const CNNLayer* layer) {
bool isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
layerName = layer->name;
if ((layer->insData.size() != numOfInputs && layer->insData.size() != (numOfInputs - 1)) || layer->outData.size() != 1)
const auto cumsum = std::dynamic_pointer_cast<const ngraph::opset3::CumSum>(op);
if (!cumsum) {
errorMessage = "Only opset3 CumSum operation is supported";
return false;
}
} catch (...) {
return false;
}
return true;
}
public:
explicit CumSumImpl(const std::shared_ptr<ngraph::Node>& op) {
try {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
layerName = op->get_friendly_name();
if ((op->get_input_size() != numOfInputs && op->get_input_size() != (numOfInputs - 1)) || op->get_output_size() != 1)
IE_THROW() << "CumSum layer with name '" << layerName << "' has incorrect number of input/output edges!";
const auto &dataTensor = layer->insData[CUM_SUM_DATA].lock()->getTensorDesc();
const auto &dataShape = dataTensor.getDims();
const auto &dataShape = op->get_input_shape(CUM_SUM_DATA);
if (dataShape.size() < 1) {
IE_THROW() << "CumSum layer with name '" << layerName << "' doesn't support 'data' input tensor with rank: " << dataShape.size();
}
numOfDims = dataShape.size();
exclusive = layer->GetParamAsBool("exclusive", false);
reverse = layer->GetParamAsBool("reverse", false);
const auto cumsum = std::dynamic_pointer_cast<const ngraph::opset3::CumSum>(op);
exclusive = cumsum->is_exclusive();
reverse = cumsum->is_reverse();
const auto& dataPrecision = dataTensor.getPrecision();
auto dataPrecision = details::convertPrecision(cumsum->get_input_element_type(CUM_SUM_DATA));
if (dataPrecision != Precision::I8 && dataPrecision != Precision::U8 && dataPrecision != Precision::I16 && dataPrecision != Precision::I32 &&
dataPrecision != Precision::FP32 && dataPrecision != Precision::I64 && dataPrecision != Precision::U64 && dataPrecision != Precision::BF16)
IE_THROW() << "CumSum layer with name '" << layerName << "' has unsupported 'data' input precision: " << dataPrecision.name();
if (layer->insData.size() == numOfInputs) {
const auto& axisTensor = layer->insData[AXIS].lock()->getTensorDesc();
const auto& axisTensorPrec = layer->insData[AXIS].lock()->getTensorDesc().getPrecision();
if (cumsum->get_input_size() == numOfInputs) {
const auto& axisTensorPrec = details::convertPrecision(cumsum->get_input_element_type(AXIS));
if (axisTensorPrec != Precision::I32 && axisTensorPrec != Precision::I64)
IE_THROW() << "CumSum layer with name '" << layerName << "' has unsupported 'axis' input precision: " << axisTensorPrec.name();
const auto axisTensorRank = axisTensor.getDims().size();
if (axisTensorRank != 0)
IE_THROW() << "CumSum layer with name '" << layerName << "' doesn't support 'axis' input tensor with rank: " << axisTensorRank;
if (!ngraph::is_scalar(cumsum->get_input_shape(AXIS)))
IE_THROW() << "CumSum layer with name '" << layerName << "' doesn't support 'axis' input tensor with non scalar rank";
}
if (dataShape != layer->outData[0]->getTensorDesc().getDims())
if (dataShape != cumsum->get_output_shape(0))
IE_THROW() << "CumSum layer with name '" << layerName << "' has different 'data' input and output dimensions";
shape = dataShape;
LayerConfig config;
for (size_t i = 0; i < layer->insData.size(); i++) {
DataConfig inConfig;
inConfig.inPlace = -1;
inConfig.constant = false;
Precision inPrecision = i == 1 ? Precision(Precision::I32) : layer->insData[i].lock()->getTensorDesc().getPrecision();
if (inPrecision == Precision::BF16)
inPrecision = Precision::FP32;
const SizeVector& inDims = layer->insData[i].lock()->getTensorDesc().getDims();
inConfig.desc = TensorDesc(inPrecision, inDims, InferenceEngine::TensorDesc::getLayoutByDims(inDims));
config.inConfs.push_back(inConfig);
}
DataConfig outConfig;
outConfig.inPlace = -1;
outConfig.constant = false;
Precision outPrecision = layer->insData[CUM_SUM_DATA].lock()->getTensorDesc().getPrecision();
if (outPrecision == Precision::BF16)
outPrecision = Precision::FP32;
const SizeVector& outDims = layer->outData[0]->getTensorDesc().getDims();
outConfig.desc = TensorDesc(outPrecision, outDims, InferenceEngine::TensorDesc::getLayoutByDims(outDims));
config.outConfs.push_back(outConfig);
config.dynBatchSupport = false;
confs.push_back(config);
std::vector<DataConfigurator> inDataConfigurators;
if (dataPrecision == Precision::BF16)
dataPrecision = Precision::FP32;
inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, dataPrecision});
if (op->get_input_size() > 1)
inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, Precision::I32});
addConfig(op, inDataConfigurators, {{TensorDescCreatorTypes::ncsp, dataPrecision}});
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
}
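CumSum above also shows the precision fallback applied in several of these reference implementations: bf16 data is registered as fp32 in the port configuration, presumably because the reference kernel computes in fp32. A one-line sketch of that normalization (the helper name is hypothetical):

enum class Prec { FP32, BF16, I32, I64 };

// bf16 tensors are declared as fp32 in the layer configuration; every other
// precision passes through unchanged.
inline Prec normalizeForReferenceKernel(Prec p) {
    return (p == Prec::BF16) ? Prec::FP32 : p;
}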

View File

@ -10,12 +10,17 @@
#include <string>
#include <utility>
#include <algorithm>
#include "caseless.hpp"
#include "ie_parallel.hpp"
#include "common/tensor_desc_creator.h"
#include <ngraph/op/detection_output.hpp>
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
using MKLDNNPlugin::TensorDescCreatorTypes;
template <typename T>
static bool SortScorePairDescend(const std::pair<float, T>& pair1,
const std::pair<float, T>& pair2) {
@ -24,98 +29,95 @@ static bool SortScorePairDescend(const std::pair<float, T>& pair1,
class DetectionOutputImpl: public ExtLayerBase {
public:
explicit DetectionOutputImpl(const CNNLayer* layer) {
bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
if (layer->insData.size() != 3 && layer->insData.size() != 5)
IE_THROW() << "Incorrect number of input edges for layer " << layer->name;
if (layer->outData.empty())
IE_THROW() << "Incorrect number of output edges for layer " << layer->name;
auto doOp = ngraph::as_type_ptr<const ngraph::op::v0::DetectionOutput>(op);
if (!doOp) {
errorMessage = "Node is not an instance of the DetectionOutput from the operations set v0.";
return false;
}
if (!details::CaselessEq<std::string>()(doOp->get_attrs().code_type, "caffe.PriorBoxParameter.CENTER_SIZE") &&
!details::CaselessEq<std::string>()(doOp->get_attrs().code_type, "caffe.PriorBoxParameter.CORNER")) {
errorMessage = "Unsupported code_type attribute.";
return false;
}
} catch (...) {
return false;
}
return true;
}
_num_classes = layer->GetParamAsInt("num_classes");
_background_label_id = layer->GetParamAsInt("background_label_id", 0);
_top_k = layer->GetParamAsInt("top_k", -1);
_variance_encoded_in_target = layer->GetParamAsBool("variance_encoded_in_target", false);
_keep_top_k = layer->GetParamAsInt("keep_top_k", -1);
_nms_threshold = layer->GetParamAsFloat("nms_threshold");
_confidence_threshold = layer->GetParamAsFloat("confidence_threshold", -FLT_MAX);
_share_location = layer->GetParamAsBool("share_location", true);
_clip_before_nms = layer->GetParamAsBool("clip_before_nms", false) ||
layer->GetParamAsBool("clip", false); // for backward compatibility
_clip_after_nms = layer->GetParamAsBool("clip_after_nms", false);
_decrease_label_id = layer->GetParamAsBool("decrease_label_id", false);
_normalized = layer->GetParamAsBool("normalized", true);
_image_height = layer->GetParamAsInt("input_height", 1);
_image_width = layer->GetParamAsInt("input_width", 1);
explicit DetectionOutputImpl(const std::shared_ptr<ngraph::Node>& op) {
try {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
if (op->get_input_size() != 3 && op->get_input_size() != 5)
IE_THROW() << "Invalid number of input edges.";
if (op->get_output_size() != 1)
IE_THROW() << "Invalid number of output edges.";
auto doOp = ngraph::as_type_ptr<const ngraph::op::v0::DetectionOutput>(op);
auto attributes = doOp->get_attrs();
_num_classes = attributes.num_classes;
_background_label_id = attributes.background_label_id;
_top_k = attributes.top_k;
_variance_encoded_in_target = attributes.variance_encoded_in_target;
_keep_top_k = attributes.keep_top_k[0];
_nms_threshold = attributes.nms_threshold;
_confidence_threshold = attributes.confidence_threshold;
_share_location = attributes.share_location;
_clip_before_nms = attributes.clip_before_nms;
_clip_after_nms = attributes.clip_after_nms;
_decrease_label_id = attributes.decrease_label_id;
_normalized = attributes.normalized;
_image_height = attributes.input_height;
_image_width = attributes.input_width;
_prior_size = _normalized ? 4 : 5;
_offset = _normalized ? 0 : 1;
_num_loc_classes = _share_location ? 1 : _num_classes;
with_add_box_pred = layer->insData.size() == 5;
_objectness_score = layer->GetParamAsFloat("objectness_score", 0.0f);
with_add_box_pred = op->get_input_size() == 5;
_objectness_score = attributes.objectness_score;
std::string code_type_str = layer->GetParamAsString("code_type", "caffe.PriorBoxParameter.CORNER");
_code_type = (code_type_str == "caffe.PriorBoxParameter.CENTER_SIZE" ? CodeType::CENTER_SIZE
: CodeType::CORNER);
_code_type = (details::CaselessEq<std::string>()(attributes.code_type, "caffe.PriorBoxParameter.CENTER_SIZE") ?
CodeType::CENTER_SIZE : CodeType::CORNER);
_num_priors = static_cast<int>(layer->insData[idx_priors].lock()->getDims().back() / _prior_size);
_priors_batches = layer->insData[idx_priors].lock()->getDims().front() != 1;
_num_priors = static_cast<int>(op->get_input_shape(idx_priors).back() / _prior_size);
_priors_batches = op->get_input_shape(idx_priors).front() != 1;
if (_num_priors * _num_loc_classes * 4 != static_cast<int>(layer->insData[idx_location].lock()->getDims()[1]))
if (_num_priors * _num_loc_classes * 4 != static_cast<int>(op->get_input_shape(idx_location)[1]))
IE_THROW() << "Number of priors must match number of location predictions ("
<< _num_priors * _num_loc_classes * 4 << " vs "
<< layer->insData[idx_location].lock()->getDims()[1] << ")";
<< op->get_input_shape(idx_location)[1] << ")";
if (_num_priors * _num_classes != static_cast<int>(layer->insData[idx_confidence].lock()->getTensorDesc().getDims().back()))
if (_num_priors * _num_classes != static_cast<int>(op->get_input_shape(idx_confidence).back()))
IE_THROW() << "Number of priors must match number of confidence predictions.";
if (_decrease_label_id && _background_label_id != 0)
IE_THROW() << "Cannot use decrease_label_id and background_label_id parameter simultaneously.";
_num = static_cast<int>(layer->insData[idx_confidence].lock()->getTensorDesc().getDims()[0]);
_num = static_cast<int>(op->get_input_shape(idx_confidence)[0]);
InferenceEngine::SizeVector bboxes_size{static_cast<size_t>(_num),
static_cast<size_t>(_num_classes),
static_cast<size_t>(_num_priors),
4};
_decoded_bboxes = InferenceEngine::make_shared_blob<float>({Precision::FP32, bboxes_size, NCHW});
_decoded_bboxes->allocate();
_decoded_bboxes.resize(_num * _num_classes * _num_priors * 4);
_buffer.resize(_num * _num_classes * _num_priors);
_indices.resize(_num * _num_classes * _num_priors);
_detections_count.resize(_num * _num_classes);
_bbox_sizes.resize(_num * _num_classes * _num_priors);
_num_priors_actual.resize(_num);
InferenceEngine::SizeVector buf_size{static_cast<size_t>(_num),
static_cast<size_t>(_num_classes),
static_cast<size_t>(_num_priors)};
_buffer = InferenceEngine::make_shared_blob<int>({Precision::I32, buf_size, {buf_size, {0, 1, 2}}});
_buffer->allocate();
const auto &confSize = op->get_input_shape(idx_confidence);
_reordered_conf.resize(std::accumulate(confSize.begin(), confSize.end(), 1, std::multiplies<size_t>()));
InferenceEngine::SizeVector indices_size{static_cast<size_t>(_num),
static_cast<size_t>(_num_classes),
static_cast<size_t>(_num_priors)};
_indices = InferenceEngine::make_shared_blob<int>(
{Precision::I32, indices_size, {indices_size, {0, 1, 2}}});
_indices->allocate();
InferenceEngine::SizeVector detections_size{static_cast<size_t>((size_t)(_num) * _num_classes)};
_detections_count = InferenceEngine::make_shared_blob<int>({Precision::I32, detections_size, C});
_detections_count->allocate();
const InferenceEngine::SizeVector &conf_size = layer->insData[idx_confidence].lock()->getTensorDesc().getDims();
_reordered_conf = InferenceEngine::make_shared_blob<float>({Precision::FP32, conf_size, ANY});
_reordered_conf->allocate();
InferenceEngine::SizeVector decoded_bboxes_size{static_cast<size_t>(_num),
static_cast<size_t>(_num_priors),
static_cast<size_t>(_num_classes)};
_bbox_sizes = InferenceEngine::make_shared_blob<float>(
{Precision::FP32, decoded_bboxes_size, {decoded_bboxes_size, {0, 1, 2}}});
_bbox_sizes->allocate();
InferenceEngine::SizeVector num_priors_actual_size{static_cast<size_t>(_num)};
_num_priors_actual = InferenceEngine::make_shared_blob<int>({Precision::I32, num_priors_actual_size, C});
_num_priors_actual->allocate();
std::vector<DataConfigurator> in_data_conf(layer->insData.size(), DataConfigurator(ConfLayout::PLN, Precision::FP32));
addConfig(layer, in_data_conf, {DataConfigurator(ConfLayout::PLN, Precision::FP32)});
std::vector<DataConfigurator> inDataConfigurators(op->get_input_size(), {TensorDescCreatorTypes::ncsp, Precision::FP32});
addConfig(op, inDataConfigurators,
{{TensorDescCreatorTypes::ncsp, Precision::FP32}});
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
throw;
}
}
@ -131,13 +133,13 @@ public:
const int N = inputs[idx_confidence]->getTensorDesc().getDims()[0];
float *decoded_bboxes_data = _decoded_bboxes->buffer().as<float *>();
float *reordered_conf_data = _reordered_conf->buffer().as<float *>();
float *bbox_sizes_data = _bbox_sizes->buffer().as<float *>();
int *detections_data = _detections_count->buffer().as<int *>();
int *buffer_data = _buffer->buffer().as<int *>();
int *indices_data = _indices->buffer().as<int *>();
int *num_priors_actual = _num_priors_actual->buffer().as<int *>();
float *decoded_bboxes_data = _decoded_bboxes.data();
float *reordered_conf_data = _reordered_conf.data();
float *bbox_sizes_data = _bbox_sizes.data();
int *detections_data = _detections_count.data();
int *buffer_data = _buffer.data();
int *indices_data = _indices.data();
int *num_priors_actual = _num_priors_actual.data();
for (int n = 0; n < N; ++n) {
const float *ppriors = prior_data;
@ -396,13 +398,13 @@ private:
void nms_mx(const float *conf_data, const float *bboxes, const float *sizes,
int *buffer, int *indices, int *detections, int num_priors_actual);
InferenceEngine::Blob::Ptr _decoded_bboxes;
InferenceEngine::Blob::Ptr _buffer;
InferenceEngine::Blob::Ptr _indices;
InferenceEngine::Blob::Ptr _detections_count;
InferenceEngine::Blob::Ptr _reordered_conf;
InferenceEngine::Blob::Ptr _bbox_sizes;
InferenceEngine::Blob::Ptr _num_priors_actual;
std::vector<float> _decoded_bboxes;
std::vector<int> _buffer;
std::vector<int> _indices;
std::vector<int> _detections_count;
std::vector<float> _reordered_conf;
std::vector<float> _bbox_sizes;
std::vector<int> _num_priors_actual;
};
struct ConfidenceComparator {

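The DetectionOutput changes above replace the pre-allocated Blob::Ptr scratch buffers with plain std::vector members: sizing happens once in the constructor, and raw pointers come from data() instead of buffer().as<T*>(). A condensed sketch of the resulting ownership model (names are illustrative):

#include <cstddef>
#include <vector>

// Scratch storage for one inference: the vectors own their memory, so no explicit
// allocate() call or Blob bookkeeping is needed.
struct DetectionOutputScratch {
    std::vector<float> decoded_bboxes;
    std::vector<int>   indices;
    std::vector<int>   detections_count;

    DetectionOutputScratch(size_t num, size_t classes, size_t priors)
        : decoded_bboxes(num * classes * priors * 4),  // 4 coordinates per box
          indices(num * classes * priors),
          detections_count(num * classes) {}
};

// In execute(), raw pointers are then taken the same way the patch does:
//   float* bboxes = scratch.decoded_bboxes.data();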
View File

@ -12,6 +12,8 @@
#include <utility>
#include <algorithm>
#include "ie_parallel.hpp"
#include "common/tensor_desc_creator.h"
#include <ngraph/op/experimental_detectron_detection_output.hpp>
namespace {
@ -44,6 +46,8 @@ namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
using MKLDNNPlugin::TensorDescCreatorTypes;
static
void refine_boxes(const float* boxes, const float* deltas, const float* weights, const float* scores,
float* refined_boxes, float* refined_boxes_areas, float* refined_scores,
@ -235,46 +239,46 @@ private:
const int OUTPUT_SCORES {2};
public:
explicit ExperimentalDetectronDetectionOutputImpl(const CNNLayer* layer) {
bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
score_threshold_ = layer->GetParamAsFloat("score_threshold");
nms_threshold_ = layer->GetParamAsFloat("nms_threshold");
max_delta_log_wh_ = layer->GetParamAsFloat("max_delta_log_wh");
classes_num_ = layer->GetParamAsInt("num_classes");
max_detections_per_class_ = layer->GetParamAsInt("post_nms_count");
max_detections_per_image_ = layer->GetParamAsInt("max_detections_per_image");
class_agnostic_box_regression_ = layer->GetParamAsBool("class_agnostic_box_regression", false);
deltas_weights_ = layer->GetParamAsFloats("deltas_weights");
LayerConfig config;
for (auto in : layer->insData) {
auto in_ = in.lock();
auto dims = in_->getTensorDesc().getDims();
DataConfig data;
data.desc = TensorDesc(Precision::FP32, dims, in_->getTensorDesc().getLayoutByDims(dims));
config.inConfs.push_back(data);
auto doOp = ngraph::as_type_ptr<const ngraph::op::v6::ExperimentalDetectronDetectionOutput>(op);
if (!doOp) {
errorMessage = "Node is not an instance of the ExperimentalDetectronDetectionOutput from the operations set v6.";
return false;
}
} catch (...) {
return false;
}
return true;
}
auto dimsB = layer->outData[OUTPUT_BOXES]->getTensorDesc().getDims();
DataConfig dataB;
dataB.desc = TensorDesc(Precision::FP32, dimsB,
layer->outData[OUTPUT_BOXES]->getTensorDesc().getLayoutByDims(dimsB));
config.outConfs.push_back(dataB);
auto dimsC = layer->outData[OUTPUT_CLASSES]->getTensorDesc().getDims();
DataConfig dataC;
dataC.desc = TensorDesc(Precision::I32, dimsC,
layer->outData[OUTPUT_BOXES]->getTensorDesc().getLayoutByDims(dimsC));
config.outConfs.push_back(dataC);
auto dimsS = layer->outData[OUTPUT_SCORES]->getTensorDesc().getDims();
DataConfig dataS;
dataS.desc = TensorDesc(Precision::FP32, dimsS,
layer->outData[OUTPUT_BOXES]->getTensorDesc().getLayoutByDims(dimsS));
config.outConfs.push_back(dataS);
config.dynBatchSupport = false;
confs.push_back(config);
explicit ExperimentalDetectronDetectionOutputImpl(const std::shared_ptr<ngraph::Node>& op) {
try {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
auto doOp = ngraph::as_type_ptr<const ngraph::op::v6::ExperimentalDetectronDetectionOutput>(op);
auto attributes = doOp->get_attrs();
score_threshold_ = attributes.score_threshold;
nms_threshold_ = attributes.nms_threshold;
max_delta_log_wh_ = attributes.max_delta_log_wh;
classes_num_ = attributes.num_classes;
max_detections_per_class_ = attributes.post_nms_count;
max_detections_per_image_ = attributes.max_detections_per_image;
class_agnostic_box_regression_ = attributes.class_agnostic_box_regression;
deltas_weights_ = attributes.deltas_weights;
std::vector<DataConfigurator> inDataConfigurators(op->get_input_size(), {TensorDescCreatorTypes::ncsp, Precision::FP32});
addConfig(op, inDataConfigurators,
{{TensorDescCreatorTypes::ncsp, Precision::FP32},
{TensorDescCreatorTypes::ncsp, Precision::I32},
{TensorDescCreatorTypes::ncsp, Precision::FP32}});
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
throw;
}
}
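ExperimentalDetectronDetectionOutput above illustrates the attribute-access change common to these conversions: instead of parsing string parameters with GetParamAsFloat/GetParamAsInt/GetParamAsBool, the implementation copies typed fields directly from the operation's attribute structure. A schematic sketch with stand-in types (the struct below is illustrative, not the real ngraph attrs layout):

#include <vector>

// Stand-in for the typed attribute struct carried by the ngraph operation.
struct DetectionOutputAttrs {
    float score_threshold = 0.f;
    float nms_threshold = 0.f;
    int   num_classes = 0;
    bool  class_agnostic_box_regression = false;
    std::vector<float> deltas_weights;
};

// The node implementation simply copies the typed fields; no string parsing
// and no default-value handling in the plugin.
struct DetectionOutputNode {
    float score_threshold_;
    float nms_threshold_;
    int   classes_num_;
    bool  class_agnostic_box_regression_;
    std::vector<float> deltas_weights_;

    explicit DetectionOutputNode(const DetectionOutputAttrs& a)
        : score_threshold_(a.score_threshold),
          nms_threshold_(a.nms_threshold),
          classes_num_(a.num_classes),
          class_agnostic_box_regression_(a.class_agnostic_box_regression),
          deltas_weights_(a.deltas_weights) {}
};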

View File

@ -1,247 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "embedding_bag_sum.hpp"
#include "ie_parallel.hpp"
#include <vector>
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class EmbeddingBagOffsetsSumImpl: public MKLDNNEmbeddingBagSum {
public:
explicit EmbeddingBagOffsetsSumImpl(const CNNLayer* layer) :
MKLDNNEmbeddingBagSum(layer, 3lu, 1lu, 4lu, 3lu) {
auto indicesData = layer->insData[INDICES_IDX].lock();
if (indicesData == nullptr)
IE_THROW() << "'" << layer->name << "' layer has nullable indices data.";
if (indicesData->getTensorDesc().getDims().size() != 1)
IE_THROW() << "'" << layer->name << "' layer has indices data with invalid shape.";
auto offsetsData = layer->insData[OFFSETS_IDX].lock();
if (offsetsData == nullptr)
IE_THROW() << "'" << layer->name << "' layer has invalid offsets data.";
if (offsetsData->getTensorDesc().getDims().size() != 1)
IE_THROW() << "'" << layer->name << "' layer's offsets data has invalid shape.";
_indicesLen = indicesData->getTensorDesc().getDims()[0];
_offsetsLen = offsetsData->getTensorDesc().getDims()[0];
}
StatusCode execute(
std::vector<Blob::Ptr>& inputs,
std::vector<Blob::Ptr>& outputs,
ResponseDesc* resp) noexcept override {
switch (inputs[0]->getTensorDesc().getPrecision()) {
case Precision::FP32: {
return processData<PrecisionTrait<Precision::FP32>::value_type>(inputs, outputs, resp);
}
case Precision::I8: {
return processData<PrecisionTrait<Precision::I8>::value_type>(inputs, outputs, resp);
}
case Precision::U8: {
return processData<PrecisionTrait<Precision::U8>::value_type>(inputs, outputs, resp);
}
case Precision::I32: {
return processData<PrecisionTrait<Precision::I32>::value_type>(inputs, outputs, resp);
}
default: {
if (resp) {
std::string errorMsg = "EmbeddingBagSum layer does not support embedding table precision '"
+ std::string(inputs[0]->getTensorDesc().getPrecision().name()) + "'";
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
}
return GENERAL_ERROR;
}
}
}
protected:
template<typename T>
StatusCode processData(
std::vector<Blob::Ptr>& inputs,
std::vector<Blob::Ptr>& outputs,
ResponseDesc* resp) noexcept {
switch (inputs[1]->getTensorDesc().getPrecision()) {
case Precision::I32: {
return processData<T, PrecisionTrait<Precision::I32>::value_type>(inputs, outputs, resp);
}
case Precision::I64: {
return processData<T, PrecisionTrait<Precision::I64>::value_type>(inputs, outputs, resp);
}
case Precision::U64: {
return processData<T, PrecisionTrait<Precision::U64>::value_type>(inputs, outputs, resp);
}
default: {
if (resp) {
std::string errorMsg = "EmbeddingBagSum layer does not support indices precision '"
+ std::string(inputs[1]->getTensorDesc().getPrecision().name()) + "'";
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
}
return GENERAL_ERROR;
}
}
}
template<typename T, typename I>
StatusCode processData(
std::vector<Blob::Ptr>& inputs,
std::vector<Blob::Ptr>& outputs,
ResponseDesc* resp) noexcept {
std::string errorMsg;
std::string msgPrefix = std::string("Layer EmbeddingBagOffsetsSum with name '") + _layerName + "' ";
const T* srcData = inputs[0]->cbuffer().as<const T*>() +
inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
T* dstData = outputs[0]->buffer().as<T*>() +
outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
const I* indicesData = inputs[INDICES_IDX]->cbuffer().as<const I*>();
const I* offsetsData = inputs[OFFSETS_IDX]->cbuffer().as<const I*>();
int64_t defaultIndex = -1;
if (inputs.size() > DEFAULT_INDEX_IDX) {
defaultIndex = (int64_t)inputs[DEFAULT_INDEX_IDX]->cbuffer().as<const I*>()[0];
if (defaultIndex < 0 || defaultIndex >= _indicesLen) {
std::string msg = "Invalid default index: " + std::to_string(defaultIndex);
msg.copy(resp->msg, sizeof(resp->msg) - 1);
return GENERAL_ERROR;
}
}
const T* weightsData = nullptr;
if (_withWeights)
weightsData = inputs[PER_SAMPLE_WEIGHTS_IDX]->cbuffer().as<const T*>();
const auto& inDataDims = inputs[0]->getTensorDesc().getDims();
const size_t OUTPUT_BAGS_NUM = outputs[0]->getTensorDesc().getDims()[0];
std::function<void(size_t, const I*&, size_t&, size_t&, bool&)> get_idx =
[&](size_t embIndex, const I*& indicesRef, size_t& outSize, size_t& weightsIdx, bool& withWeights) {
if (embIndex >= _offsetsLen) {
errorMsg = msgPrefix + "has invalid embedding bag index.";
return;
}
if (offsetsData[embIndex] >= _indicesLen) {
errorMsg = msgPrefix + ". Offset value exceeds indices size in the model.\noffset: "
+ std::to_string(offsetsData[embIndex]) + "; indices size: " + std::to_string(_indicesLen);
return;
}
indicesRef = nullptr;
outSize = 0lu;
withWeights = _withWeights;
if (embIndex == _offsetsLen - 1lu)
outSize = _indicesLen - offsetsData[embIndex];
else
outSize = offsetsData[embIndex + 1lu] - offsetsData[embIndex];
if (outSize != 0lu) {
indicesRef = indicesData + offsetsData[embIndex];
} else {
// Empty or default bag
withWeights = false;
if (defaultIndex >= 0) {
indicesRef = reinterpret_cast<I*>(&defaultIndex);
outSize = 1lu;
}
return;
}
if (withWeights)
weightsIdx = offsetsData[embIndex];
};
auto threadBody = [&](const int ithr, const int nthr) {
size_t start(0lu), end(0lu);
splitter(OUTPUT_BAGS_NUM, nthr, ithr, start, end);
if (start >= end)
return;
size_t indicesSize = 0lu;
const I* indices = nullptr;
size_t weightsIdx = 0lu;
bool withWeights = _withWeights;
for (size_t obi = start; obi < end; obi++) {
size_t dstIndex = obi * _embDepth;
get_idx(obi, indices, indicesSize, weightsIdx, withWeights);
if (indices != nullptr) {
withWeights = withWeights & _withWeights;
size_t inIdx = 0lu;
if (indices[inIdx] >= inDataDims[0]) {
errorMsg = msgPrefix + "has invalid embedding bag index: " + std::to_string(indices[inIdx]);
return;
}
size_t srcIndex = indices[inIdx] * _embDepth;
if (withWeights) {
for (size_t i = 0lu; i < _embDepth; i++) {
dstData[dstIndex + i] = srcData[srcIndex + i] * weightsData[weightsIdx];
}
weightsIdx++;
} else {
for (size_t i = 0lu; i < _embDepth; i++) {
dstData[dstIndex + i] = srcData[srcIndex + i];
}
}
for (inIdx = 1lu; inIdx < indicesSize; inIdx++) {
if (indices[inIdx] >= inDataDims[0]) {
errorMsg = msgPrefix + "has invalid embedding bag index: " + std::to_string(indices[inIdx]);
return;
}
size_t srcIndex = indices[inIdx] * _embDepth;
if (withWeights) {
for (size_t i = 0lu; i < _embDepth; i++) {
dstData[dstIndex + i] += srcData[srcIndex + i] * weightsData[weightsIdx];
}
weightsIdx++;
} else {
for (size_t i = 0lu; i < _embDepth; i++) {
dstData[dstIndex + i] += srcData[srcIndex + i];
}
}
}
} else {
for (size_t i = 0lu; i < _embDepth; i++) {
dstData[dstIndex + i] = 0;
}
}
}
};
parallel_nt(0, threadBody);
if (!errorMsg.empty()) {
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
return GENERAL_ERROR;
}
return OK;
}
void initFromInputs(std::vector<Blob::Ptr>& inputs) override {
}
void getIndices(size_t embIndex, const size_t*& indices, size_t& size, size_t& weightsIdx, bool& withWeights) override {
}
const size_t OFFSETS_IDX = 2lu;
size_t _indicesLen;
size_t _offsetsLen;
};
REG_FACTORY_FOR(EmbeddingBagOffsetsSumImpl, EmbeddingBagOffsetsSum);
} // namespace Cpu
} // namespace Extensions
} // namespace InferenceEngine
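The deleted kernel above parallelizes over output bags with parallel_nt plus splitter: each thread receives a contiguous [start, end) slice of bags and processes it independently, which is why the per-thread body simply returns when start >= end. A dependency-free sketch of that static partitioning (split_range is a hypothetical helper assumed to behave like the splitter used above):

#include <algorithm>
#include <cstddef>

// Splits n work items as evenly as possible across nthr threads and returns
// thread ithr's half-open range [start, end); the first (n % nthr) threads get
// one extra item each.
inline void split_range(size_t n, int nthr, int ithr, size_t& start, size_t& end) {
    const size_t chunk = n / static_cast<size_t>(nthr);
    const size_t remainder = n % static_cast<size_t>(nthr);
    const size_t i = static_cast<size_t>(ithr);
    start = i * chunk + std::min(i, remainder);
    end = start + chunk + (i < remainder ? 1 : 0);
}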

View File

@ -1,67 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "embedding_bag_sum.hpp"
#include "common/cpu_memcpy.h"
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class EmbeddingBagPackedSumImpl: public MKLDNNEmbeddingBagSum {
public:
explicit EmbeddingBagPackedSumImpl(const CNNLayer* layer) :
MKLDNNEmbeddingBagSum(layer, 2lu, 1lu, 2lu, 3lu) {
auto indicesData = layer->insData[INDICES_IDX].lock();
if (indicesData == nullptr)
IE_THROW() << "'" << layer->name << "' layer has nullable indices data.";
if (indicesData->getTensorDesc().getDims().size() != 2)
IE_THROW() << "'" << layer->name << "' layer has indices data with invalid shape.";
_indices = std::vector<std::vector<size_t>>(
indicesData->getTensorDesc().getDims()[0],
std::vector<size_t>(indicesData->getTensorDesc().getDims()[1], 0lu));
}
void initFromInputs(std::vector<Blob::Ptr>& inputs) override {
// Initialize indices
const size_t bagsNum = inputs[INDICES_IDX]->getTensorDesc().getDims()[0];
const size_t batch = inputs[INDICES_IDX]->getTensorDesc().getDims()[1];
if (inputs[INDICES_IDX]->getTensorDesc().getPrecision().size() == sizeof(INT32)) {
const INT32* src = inputs[INDICES_IDX]->cbuffer().as<const INT32*>();
for (size_t i = 0lu; i < bagsNum; i++) {
size_t ibn = i * batch;
for (size_t j = 0lu; j < batch; j++) {
_indices[i][j] = static_cast<size_t>(src[ibn + j]);
}
}
} else if (inputs[INDICES_IDX]->getTensorDesc().getPrecision().size() == sizeof(UINT64)) {
const UINT64* src = inputs[INDICES_IDX]->cbuffer().as<const UINT64*>();
for (size_t i = 0lu; i < bagsNum; i++) {
cpu_memcpy(_indices[i].data(), src + i * batch, batch * sizeof(UINT64));
}
}
}
void getIndices(size_t embIndex, const size_t*& indices, size_t& size, size_t& weightsIdx, bool& withWeights) override {
if (embIndex >= _indices.size())
IE_THROW() << "Invalid embedding bag index.";
withWeights = true;
indices = _indices[embIndex].data();
size = _indices[0].size();
weightsIdx = embIndex * _indices[0].size();
}
protected:
std::vector<std::vector<size_t>> _indices;
};
REG_FACTORY_FOR(EmbeddingBagPackedSumImpl, EmbeddingBagPackedSum);
} // namespace Cpu
} // namespace Extensions
} // namespace InferenceEngine

View File

@ -1,209 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "embedding_bag_sum.hpp"
#include "ie_parallel.hpp"
#include "list.hpp"
#include <set>
#include <string>
#include <vector>
using namespace InferenceEngine;
using namespace InferenceEngine::Extensions::Cpu;
const std::set<size_t> MKLDNNEmbeddingBagSum::_supportedIndicesTypeSize = {sizeof(INT32), sizeof(INT64)};
MKLDNNEmbeddingBagSum::MKLDNNEmbeddingBagSum(
const CNNLayer* layer,
size_t requiredInputNum,
size_t indicesIdx,
size_t perSampleWeightsIdx,
size_t defaultIndexIdx,
const std::set<Precision>& supportedPrecisions) :
INDICES_IDX(indicesIdx),
PER_SAMPLE_WEIGHTS_IDX(perSampleWeightsIdx),
DEFAULT_INDEX_IDX(defaultIndexIdx) {
try {
std::string logPrefix = std::string("Layer EmbeddingBagSum with name '") + layer->name + "' ";
if (layer->insData.size() < requiredInputNum || layer->outData.size() != 1)
IE_THROW() << logPrefix << "has incorrect number of input or output edges!";
_layerName = layer->name;
auto inData = layer->insData[0].lock();
auto indicesData = layer->insData[INDICES_IDX].lock();
if (inData == nullptr || indicesData == nullptr)
IE_THROW() << logPrefix << "has nullable input data.";
auto dataPrecision = inData->getTensorDesc().getPrecision();
if (dataPrecision == Precision::BF16)
dataPrecision = Precision::FP32;
if (!supportedPrecisions.empty()) {
if (supportedPrecisions.find(dataPrecision) == supportedPrecisions.end())
IE_THROW() << logPrefix << "has unsupported precision: " << dataPrecision.name();
} else {
static const std::set<Precision> defaultSupportedPrecisions =
{Precision::FP32, Precision::I8, Precision::U8, Precision::I32};
if (defaultSupportedPrecisions.find(dataPrecision) == defaultSupportedPrecisions.end())
IE_THROW() << logPrefix << "has unsupported precision: " << dataPrecision.name();
}
if (layer->insData.size() > PER_SAMPLE_WEIGHTS_IDX)
_withWeights = true;
if (_withWeights) {
auto weightsData = layer->insData[PER_SAMPLE_WEIGHTS_IDX].lock();
if (weightsData == nullptr)
IE_THROW() << logPrefix << "has nullable weights data";
if (weightsData->getTensorDesc().getDims() != indicesData->getTensorDesc().getDims())
IE_THROW() << logPrefix << "must have equal shapes for indices and per_sample_weights inputs.";
}
LayerConfig config;
config.inConfs.resize(layer->insData.size());
for (int i = 0; i < layer->insData.size(); i++) {
auto data = layer->insData[i].lock();
if (data == nullptr)
IE_THROW() << logPrefix << "has nullable input data";
auto prc = data->getTensorDesc().getPrecision();
if (prc == Precision::BF16)
prc = Precision::FP32;
config.inConfs[i].desc = TensorDesc(prc,
data->getTensorDesc().getDims(),
TensorDesc::getLayoutByDims(data->getTensorDesc().getDims()));
}
DataConfig outConfig;
auto& outDims = layer->outData[0]->getTensorDesc().getDims();
outConfig.desc = TensorDesc(dataPrecision,
outDims,
TensorDesc::getLayoutByDims(outDims));
config.outConfs.push_back(outConfig);
config.dynBatchSupport = false;
confs.push_back(config);
const auto& inDataDims = inData->getTensorDesc().getDims();
_embDepth = 1lu;
for (size_t i = 1lu; i < inDataDims.size(); i++) {
_embDepth *= inDataDims[i];
}
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
}
}
StatusCode MKLDNNEmbeddingBagSum::execute(
std::vector<Blob::Ptr>& inputs,
std::vector<Blob::Ptr>& outputs,
ResponseDesc *resp) noexcept {
switch (inputs[0]->getTensorDesc().getPrecision()) {
case Precision::FP32: {
processData<PrecisionTrait<Precision::FP32>::value_type>(inputs, outputs);
break;
}
case Precision::I8: {
processData<PrecisionTrait<Precision::I8>::value_type>(inputs, outputs);
break;
}
case Precision::U8: {
processData<PrecisionTrait<Precision::U8>::value_type>(inputs, outputs);
break;
}
case Precision::I32: {
processData<PrecisionTrait<Precision::I32>::value_type>(inputs, outputs);
break;
}
default: {
if (resp) {
std::string errorMsg = "EmbeddingBagSum layer does not support precision '"
+ std::string(inputs[0]->getTensorDesc().getPrecision().name()) + "'";
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
}
return GENERAL_ERROR;
}
}
return OK;
}
template<typename T>
void MKLDNNEmbeddingBagSum::processData(
std::vector<Blob::Ptr>& inputs,
std::vector<Blob::Ptr>& outputs) noexcept {
const T* srcData = inputs[0]->cbuffer().as<const T*>() +
inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
T* dstData = outputs[0]->buffer().as<T*>() +
outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
const T* weightsData = nullptr;
if (_withWeights)
weightsData = inputs[PER_SAMPLE_WEIGHTS_IDX]->cbuffer().as<const T*>();
initFromInputs(inputs);
const auto& inDataDims = inputs[0]->getTensorDesc().getDims();
const size_t outputBagsNum = outputs[0]->getTensorDesc().getDims()[0];
auto threadBody = [&](const int ithr, const int nthr) {
size_t start(0lu), end(0lu);
splitter(outputBagsNum, nthr, ithr, start, end);
if (start >= end)
return;
size_t indicesSize = 0lu;
const size_t* indices = nullptr;
size_t weightsIdx = 0lu;
bool withWeights = _withWeights;
for (size_t obi = start; obi < end; obi++) {
size_t dstIndex = obi * _embDepth;
getIndices(obi, indices, indicesSize, weightsIdx, withWeights);
if (indices != nullptr) {
withWeights = withWeights & _withWeights;
size_t inIdx = 0lu;
if (indices[inIdx] >= inDataDims[0])
IE_THROW() << "EmbeddingBagSum layer '" << _layerName
<< "' has invalid embedding bag index: " << indices[inIdx];
size_t srcIndex = indices[inIdx] * _embDepth;
if (withWeights) {
for (size_t i = 0lu; i < _embDepth; i++) {
dstData[dstIndex + i] = srcData[srcIndex + i] * weightsData[weightsIdx];
}
weightsIdx++;
} else {
for (size_t i = 0lu; i < _embDepth; i++) {
dstData[dstIndex + i] = srcData[srcIndex + i];
}
}
for (inIdx = 1lu; inIdx < indicesSize; inIdx++) {
if (indices[inIdx] >= inDataDims[0])
IE_THROW() << "EmbeddingBagSum layer '" << _layerName
<< "' has invalid embedding bag index: " << indices[inIdx];
size_t srcIndex = indices[inIdx] * _embDepth;
if (withWeights) {
for (size_t i = 0lu; i < _embDepth; i++) {
dstData[dstIndex + i] += srcData[srcIndex + i] * weightsData[weightsIdx];
}
weightsIdx++;
} else {
for (size_t i = 0lu; i < _embDepth; i++) {
dstData[dstIndex + i] += srcData[srcIndex + i];
}
}
}
} else {
for (size_t i = 0lu; i < _embDepth; i++) {
dstData[dstIndex + i] = 0;
}
}
}
};
parallel_nt(0, threadBody);
}

View File

@ -1,63 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "base.hpp"
#include <memory>
#include <set>
#include <vector>
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class MKLDNNEmbeddingBagSum : public ExtLayerBase {
public:
MKLDNNEmbeddingBagSum(
const CNNLayer* layer,
size_t requiredInputsNum,
size_t indicesIdx,
size_t perSampleWeightsIdx,
size_t defaultIndexIdx,
const std::set<Precision>& supportedPrecisions = {});
StatusCode execute(
std::vector<Blob::Ptr>& inputs,
std::vector<Blob::Ptr>& outputs,
ResponseDesc *resp) noexcept override;
protected:
virtual void initFromInputs(std::vector<Blob::Ptr>& inputs) = 0;
virtual void getIndices(
size_t embIndex,
const size_t*& indicesRef,
size_t& size,
size_t& weightsIdx,
bool& withWeights) = 0;
template<typename T>
void processData(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs) noexcept;
std::set<Precision> _supportedPrecisions;
const size_t INDICES_IDX;
const size_t PER_SAMPLE_WEIGHTS_IDX;
const size_t DEFAULT_INDEX_IDX;
bool _withWeights = false;
size_t _embDepth = 0;
std::string _layerName;
using INT32 = PrecisionTrait<Precision::I32>::value_type;
using INT64 = PrecisionTrait<Precision::I64>::value_type;
using UINT64 = PrecisionTrait<Precision::U64>::value_type;
static const std::set<size_t> _supportedIndicesTypeSize;
};
} // namespace Cpu
} // namespace Extensions
} // namespace InferenceEngine

View File

@ -1,134 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "embedding_bag_sum.hpp"
#include "common/cpu_memcpy.h"
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class EmbeddingSegmentsSumImpl: public MKLDNNEmbeddingBagSum {
public:
explicit EmbeddingSegmentsSumImpl(const CNNLayer* layer) :
MKLDNNEmbeddingBagSum(layer, 4lu, 1lu, 5lu, 4lu) {
std::string errPrefix = std::string("EmbeddingSegmentsSum layer with name '") + _layerName + "' ";
auto indicesData = layer->insData[INDICES_IDX].lock();
if (indicesData == nullptr)
IE_THROW() << errPrefix << "has nullable indices data.";
if (indicesData->getTensorDesc().getDims().size() != 1)
IE_THROW() << errPrefix << "has indices data with invalid shape: "
<< indicesData->getTensorDesc().getDims().size();
auto segmentIdData = layer->insData[SEGMENT_ID_IDX].lock();
if (segmentIdData == nullptr)
IE_THROW() << errPrefix << "has invalid segmentID data.";
if (segmentIdData->getTensorDesc().getDims().size() != 1)
IE_THROW() << errPrefix << "has invalid segmentID data shape: "
<< segmentIdData->getTensorDesc().getDims().size();
auto numSegmentData = layer->insData[NUM_SEGMENTS_IDX].lock();
if (numSegmentData == nullptr)
IE_THROW() << errPrefix << "has nullable numSegmentID data.";
if (_supportedIndicesTypeSize.find(indicesData->getTensorDesc().getPrecision().size())
== _supportedIndicesTypeSize.end()
|| _supportedIndicesTypeSize.find(segmentIdData->getTensorDesc().getPrecision().size())
== _supportedIndicesTypeSize.end()
|| _supportedIndicesTypeSize.find(numSegmentData->getTensorDesc().getPrecision().size())
== _supportedIndicesTypeSize.end())
IE_THROW() << errPrefix << "has unsupported input data type.";
_indices = std::vector<size_t>(indicesData->getTensorDesc().getDims()[0], 0lu);
_segmentIds = std::vector<size_t>(segmentIdData->getTensorDesc().getDims()[0], 0lu);
}
void initFromInputs(std::vector<Blob::Ptr>& inputs) override {
// Initialize indices
if (inputs[INDICES_IDX]->getTensorDesc().getPrecision().size() == sizeof(INT32)) {
const INT32* src = inputs[INDICES_IDX]->cbuffer().as<const INT32*>();
for (size_t i = 0lu; i < inputs[INDICES_IDX]->size(); i++)
_indices[i] = static_cast<size_t>(src[i]);
} else if (inputs[INDICES_IDX]->getTensorDesc().getPrecision().size() == sizeof(UINT64)) {
const UINT64* src = inputs[INDICES_IDX]->cbuffer().as<const UINT64*>();
cpu_memcpy(_indices.data(), src, inputs[INDICES_IDX]->byteSize());
}
// Initialize segments ids
if (inputs[SEGMENT_ID_IDX]->getTensorDesc().getPrecision().size() == sizeof(INT32)) {
const INT32* src = inputs[SEGMENT_ID_IDX]->cbuffer().as<const INT32*>();
for (size_t i = 0lu; i < inputs[SEGMENT_ID_IDX]->size(); i++)
_segmentIds[i] = static_cast<size_t>(src[i]);
} else if (inputs[SEGMENT_ID_IDX]->getTensorDesc().getPrecision().size() == sizeof(UINT64)) {
const UINT64* src = inputs[SEGMENT_ID_IDX]->cbuffer().as<const UINT64*>();
cpu_memcpy(_segmentIds.data(), src, inputs[SEGMENT_ID_IDX]->byteSize());
}
if (inputs.size() > NUM_SEGMENTS_IDX) {
if (inputs[NUM_SEGMENTS_IDX]->getTensorDesc().getPrecision().size() == sizeof(INT32)) {
const INT32* src = inputs[NUM_SEGMENTS_IDX]->cbuffer().as<const INT32*>();
_numSegments = static_cast<size_t>(*src);
} else if (inputs[NUM_SEGMENTS_IDX]->getTensorDesc().getPrecision().size() == sizeof(UINT64)) {
const INT64* src = inputs[NUM_SEGMENTS_IDX]->cbuffer().as<const INT64*>();
_numSegments = *src;
}
}
// Initialize default index
_defaultIndices.clear();
if (inputs.size() > DEFAULT_INDEX_IDX) {
if (inputs[DEFAULT_INDEX_IDX]->getTensorDesc().getPrecision().size() == sizeof(INT32)) {
const INT32* src = inputs[DEFAULT_INDEX_IDX]->cbuffer().as<const INT32*>();
_defaultIndices.push_back(static_cast<size_t>(*src));
} else if (inputs[DEFAULT_INDEX_IDX]->getTensorDesc().getPrecision().size() == sizeof(UINT64)) {
const INT64* src = inputs[DEFAULT_INDEX_IDX]->cbuffer().as<const INT64*>();
_defaultIndices.push_back(*src);
}
}
}
void getIndices(size_t embIndex, const size_t*& indices, size_t& size, size_t& weightsIdx, bool& withWeight) override {
if (embIndex >= _numSegments)
IE_THROW() << "Invalid embedding bag index.";
indices = nullptr;
size = 0lu;
withWeight = true;
for (size_t si = 0; si < _indices.size(); si++) {
if (_segmentIds[si] == embIndex) {
size++;
if (indices == nullptr) {
indices = _indices.data() + si;
weightsIdx = si;
}
}
}
// Empty bag
if (size == 0) {
size = 1lu;
withWeight = false;
if (_defaultIndices.size() == 1lu)
indices = _defaultIndices.data();
return;
}
}
protected:
const size_t SEGMENT_ID_IDX = 2lu;
const size_t NUM_SEGMENTS_IDX = 3lu;
size_t _numSegments = 0lu;
std::vector<size_t> _indices;
std::vector<size_t> _segmentIds;
std::vector<size_t> _defaultIndices;
};
REG_FACTORY_FOR(EmbeddingSegmentsSumImpl, EmbeddingSegmentsSum);
} // namespace Cpu
} // namespace Extensions
} // namespace InferenceEngine
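The deleted EmbeddingSegmentsSum code above keeps indices as size_t internally and fills them from either an int32 or an int64 input buffer, dispatching on the element size of the input precision. A self-contained sketch of that widening load (a plain loop is used for both cases instead of cpu_memcpy; the helper name is illustrative):

#include <cstddef>
#include <cstdint>
#include <vector>

// Copies `count` indices from a raw int32 or int64 buffer into size_t storage,
// selecting the source type by its element size, much like the deleted code does
// with getPrecision().size().
inline void loadIndices(std::vector<size_t>& dst, const void* src,
                        size_t count, size_t elemSize) {
    dst.resize(count);
    if (elemSize == sizeof(int32_t)) {
        const int32_t* p = static_cast<const int32_t*>(src);
        for (size_t i = 0; i < count; ++i)
            dst[i] = static_cast<size_t>(p[i]);
    } else if (elemSize == sizeof(int64_t)) {
        const int64_t* p = static_cast<const int64_t*>(src);
        for (size_t i = 0; i < count; ++i)
            dst[i] = static_cast<size_t>(p[i]);
    }
}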

View File

@ -10,6 +10,9 @@
#include <cstring>
#include <string>
#include <cmath>
#include <ngraph/opsets/opset3.hpp>
using namespace MKLDNNPlugin;
namespace InferenceEngine {
namespace Extensions {
@ -267,41 +270,65 @@ private:
}
};
ExtractImagePatchesImpl::ExtractImagePatchesImpl(const CNNLayer* layer) {
bool ExtractImagePatchesImpl::isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
std::string errorPrefix = std::string("Layer ") + layer->type + " with name '" + layer->name + "' ";
if (details::CaselessEq<std::string>()("ExtractImagePatchesLayer", layer->type))
IE_THROW() << errorPrefix << "is not an instance of ExtractImagePatchesLayer class";
const auto extImgPatcher = std::dynamic_pointer_cast<const ngraph::opset3::ExtractImagePatches>(op);
if (!extImgPatcher) {
errorMessage = "Only opset3 ExtractImagePatches operation is supported";
return false;
}
const auto padValue = extImgPatcher->get_auto_pad();
if (!one_of(padValue, ngraph::op::PadType::VALID, ngraph::op::PadType::SAME_LOWER, ngraph::op::PadType::SAME_UPPER)) {
errorMessage = "Does not support pad type: " + ngraph::as_string(padValue);
return false;
}
if (!everyone_is(2, extImgPatcher->get_sizes().size(), extImgPatcher->get_strides().size(), extImgPatcher->get_rates().size())) {
errorMessage = "Doesn't support 'sizes', 'strides', 'rates', attributes with rank != 2";
return false;
}
} catch (...) {
return false;
}
return true;
}
if (layer->insData.size() != 1 || layer->outData.size() != 1)
IE_THROW() << errorPrefix << "has incorrect number of input or output edges!"
<< " Input: " << layer->insData.size() << "; Output: " << layer->outData.size();
ExtractImagePatchesImpl::ExtractImagePatchesImpl(const std::shared_ptr<ngraph::Node>& op) {
try {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
auto inData = layer->insData[0].lock();
if (inData == nullptr)
IE_THROW() << errorPrefix << "has nullable input data";
errorPrefix = "ExtractImagePatches layer with name '" + op->get_friendly_name() + "' ";
const auto extImgPatcher = std::dynamic_pointer_cast<const ngraph::opset3::ExtractImagePatches>(op);
if (inData->getTensorDesc().getDims().size() != 4)
IE_THROW() << errorPrefix << "must have 4D input tensor. Actual: " << inData->getTensorDesc().getDims().size();
if (op->get_input_size() != 1 || op->get_output_size() != 1)
IE_THROW() << errorPrefix << "has incorrect number of input or output edges!"
<< " Input: " << op->get_input_size() << "; Output: " << op->get_output_size();
if (layer->outData[0]->getTensorDesc().getDims().size() != 4)
IE_THROW() << errorPrefix << "must have 4D output tensor. Actual: " << layer->outData[0]->getTensorDesc().getDims().size();
if (op->get_input_shape(0).size() != 4)
IE_THROW() << errorPrefix << "must have 4D input tensor. Actual: " << op->get_input_shape(0).size();
if (inData->getLayout() != NCHW)
IE_THROW() << errorPrefix << "has unsupported layout: " << inData->getLayout();
if (op->get_output_shape(0).size() != 4)
IE_THROW() << errorPrefix << "must have 4D output tensor. Actual: " << op->get_output_shape(0).size();
const auto precision = inData->getTensorDesc().getPrecision();
if (_supported_precisions_sizes.find(precision.size()) == _supported_precisions_sizes.end())
IE_THROW() << errorPrefix << "has unsupported precision: " << precision.name();
const auto precision = details::convertPrecision(op->get_input_element_type(0));
if (_supported_precisions_sizes.find(precision.size()) == _supported_precisions_sizes.end())
IE_THROW() << errorPrefix << "has unsupported precision: " << precision.name();
auto ksizes = extImgPatcher->get_sizes();
auto strides = extImgPatcher->get_strides();
auto rates = extImgPatcher->get_rates();
if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::VALID) {
_auto_pad = ExtImgPatcherPadType::VALID;
} else if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::SAME_LOWER) {
_auto_pad = ExtImgPatcherPadType::SAME_LOWER;
} else if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::SAME_UPPER) {
_auto_pad = ExtImgPatcherPadType::SAME_UPPER;
} else {
IE_THROW() << errorPrefix << "has unsupported pad type: " << extImgPatcher->get_auto_pad();
}
auto ksizes = layer->GetParamAsUInts("sizes");
auto strides = layer->GetParamAsUInts("strides");
auto rates = layer->GetParamAsUInts("rates");
std::string auto_pad = layer->GetParamAsString("auto_pad");
if (!CaselessEq<std::string>()(auto_pad, "valid")
&& !CaselessEq<std::string>()(auto_pad, "same_upper")
&& !CaselessEq<std::string>()(auto_pad, "same_lower"))
IE_THROW() << errorPrefix << "has unsupported auto_pad value: " << auto_pad;
if (ksizes.size() != 2 || strides.size() != 2 || rates.size() != 2)
IE_THROW() << errorPrefix << "must have the following attributes with shape {2}: sizes, strides, rates.";
_ksizes.clear();
@ -323,12 +350,12 @@ ExtractImagePatchesImpl::ExtractImagePatchesImpl(const CNNLayer* layer) {
_rates.push_back(static_cast<size_t>(x));
}
SizeVector in_dims = inData->getTensorDesc().getDims();
SizeVector in_dims = op->get_input_shape(0);
_pad_left = 0;
_pad_top = 0;
jit_extract_image_patches_params jpp;
jpp.need_padding = false;
if (!CaselessEq<std::string>()(auto_pad, "valid")) {
if (_auto_pad != ExtImgPatcherPadType::VALID) {
const size_t iheight = in_dims[2];
const size_t iwidth = in_dims[3];
const int64_t ihStep = _ksizes[0] + (_rates[0] - 1) * (_ksizes[0] - 1);
@ -338,9 +365,9 @@ ExtractImagePatchesImpl::ExtractImagePatchesImpl(const CNNLayer* layer) {
int64_t PH = (std::ceil(1.f * iheight/_strides[0]) - 1) * _strides[0] + ihStep - iheight;
int64_t increment_sign = 0;
if (CaselessEq<std::string>()(auto_pad, "same_lower")) {
if (_auto_pad == ExtImgPatcherPadType::SAME_LOWER) {
increment_sign = 1;
} else if (CaselessEq<std::string>()(auto_pad, "same_upper")) {
} else if (_auto_pad == ExtImgPatcherPadType::SAME_UPPER) {
increment_sign = -1;
}
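// PH above is the standard "same" padding total along H: with an effective (dilated) window of
// ihStep pixels and stride _strides[0], ceil(iheight / stride) window positions are kept, so
// (positions - 1) * stride + ihStep - iheight extra pixels are required. The analogous width
// computation and the split of the total between the two borders sit in the elided part of this
// hunk; increment_sign records the SAME_LOWER / SAME_UPPER choice used for that split.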
@ -355,14 +382,14 @@ ExtractImagePatchesImpl::ExtractImagePatchesImpl(const CNNLayer* layer) {
}
jpp.IW = in_dims[3];
SizeVector out_dims = layer->outData[0]->getTensorDesc().getDims();
SizeVector out_dims = op->get_output_shape(0);
jpp.OH = out_dims[2];
jpp.OW = out_dims[3];
jpp.KH = _ksizes[0];
jpp.KW = _ksizes[1];
jpp.SH = _strides[0];
jpp.SW = _strides[1];
jpp.dtype_size = layer->insData.front().lock()->getPrecision().size();
jpp.dtype_size = precision.size();
jpp.block_size = 1;
if (mayiuse(x64::avx512_common)) {
@ -379,26 +406,13 @@ ExtractImagePatchesImpl::ExtractImagePatchesImpl(const CNNLayer* layer) {
if (extract_image_patches_kernel)
extract_image_patches_kernel->create_ker();
LayerConfig config;
DataConfig inConfig;
inConfig.desc = inData->getTensorDesc();
config.inConfs.push_back(inConfig);
DataConfig outConfig;
outConfig.desc = layer->outData[0]->getTensorDesc();
outConfig.desc.setPrecision(inConfig.desc.getPrecision());
outConfig.desc.setLayout(inConfig.desc.getLayout());
config.outConfs.push_back(outConfig);
config.dynBatchSupport = false;
confs.push_back(config);
addConfig(op, {{TensorDescCreatorTypes::ncsp, precision}},
{{TensorDescCreatorTypes::ncsp, precision}});
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
}
}
StatusCode ExtractImagePatchesImpl::execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept {
const char *src_data = inputs[0]->cbuffer().as<const char *>() +
inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();

View File

@ -42,10 +42,17 @@ struct jit_uni_extract_image_patches_kernel {
class ExtractImagePatchesImpl : public ExtLayerBase {
public:
explicit ExtractImagePatchesImpl(const CNNLayer*);
explicit ExtractImagePatchesImpl(const std::shared_ptr<ngraph::Node>& op);
StatusCode execute(std::vector<Blob::Ptr>&, std::vector<Blob::Ptr>&, ResponseDesc*) noexcept override;
bool isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept;
private:
enum class ExtImgPatcherPadType {
VALID,
SAME_LOWER,
SAME_UPPER
};
std::vector<size_t> _ksizes;
std::vector<size_t> _strides;
std::vector<size_t> _rates;
@ -53,6 +60,10 @@ private:
size_t _pad_top;
std::shared_ptr<jit_uni_extract_image_patches_kernel> extract_image_patches_kernel;
static const std::set<size_t> _supported_precisions_sizes;
ExtImgPatcherPadType _auto_pad;
std::string errorPrefix;
};
REG_FACTORY_FOR(ExtractImagePatchesImpl, ExtractImagePatches);

View File

@ -1,124 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "base.hpp"
#include <cmath>
#include <string>
#include <vector>
#include <cassert>
#include "ie_parallel.hpp"
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class FillImpl: public ExtLayerBase {
public:
explicit FillImpl(const CNNLayer* layer) {
try {
if (layer->insData.empty() || layer->outData.empty())
IE_THROW() << layer->name << " Incorrect number of input/output edges!";
if (layer->insData.size() != 2)
IE_THROW() << layer->name << " Incorrect number of input edges!";
SizeVector fill_dims = layer->insData[FILL_DIMS].lock()->getTensorDesc().getDims();
if (fill_dims.size() > 1)
IE_THROW() << layer->name << " Fill dimensions vector should be 1 dimension";
SizeVector value_dims = layer->insData[FILL_VALUE].lock()->getTensorDesc().getDims();
if (value_dims.size() > 1)
IE_THROW() << layer->name << " Value scalar should have 1 dimension";
if (!(layer->insData[FILL_VALUE].lock()->getTensorDesc().getPrecision() == Precision::I32 &&
layer->outData[0]->getTensorDesc().getPrecision() == Precision::I32) &&
!(layer->insData[FILL_VALUE].lock()->getTensorDesc().getPrecision() == Precision::FP32 &&
layer->outData[0]->getTensorDesc().getPrecision() == Precision::FP32)) {
addConfig(layer, { DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::FP32) },
{ DataConfigurator(ConfLayout::PLN, Precision::FP32) });
} else {
addConfig(layer, { DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN) },
{ DataConfigurator(ConfLayout::PLN) });
}
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
}
}
StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept override {
int32_t* fill_dims = inputs[FILL_DIMS]->cbuffer().as<int32_t *>() +
inputs[FILL_DIMS]->getTensorDesc().getBlockingDesc().getOffsetPadding();
size_t fill_size = inputs[FILL_DIMS]->getTensorDesc().getDims()[0];
SizeVector dst_dims = outputs[0]->getTensorDesc().getDims();
if (dst_dims.size() != fill_size) {
if (resp) {
std::string errorMsg = "Output tensor dimension mismatch";
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
}
return PARAMETER_MISMATCH;
}
size_t work_amount_dst = 1;
for (size_t i = 0; i < dst_dims.size(); i++) {
work_amount_dst *= fill_dims[i];
if (static_cast<int>(dst_dims[i]) != fill_dims[i]) {
if (resp) {
std::string errorMsg = "Output tensor dimension size mismatch";
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
}
return PARAMETER_MISMATCH;
}
}
switch (outputs[0]->getTensorDesc().getPrecision()) {
case Precision::FP32: {
float* dst_data = outputs[0]->cbuffer().as<float *>() +
outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
float value = (inputs[FILL_VALUE]->cbuffer().as<float *>() +
inputs[FILL_VALUE]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0];
parallel_nt(0, [&](const int ithr, const int nthr) {
size_t start = 0, end = 0;
splitter(work_amount_dst, nthr, ithr, start, end);
std::fill_n(dst_data + start, end - start, value);
});
}
break;
case Precision::I32: {
int32_t* dst_data = outputs[0]->cbuffer().as<int32_t *>() +
outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
int32_t value = (inputs[FILL_VALUE]->cbuffer().as<int32_t *>() +
inputs[FILL_VALUE]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0];
parallel_nt(0, [&](const int ithr, const int nthr) {
size_t start = 0, end = 0;
splitter(work_amount_dst, nthr, ithr, start, end);
std::fill_n(dst_data + start, end - start, value);
});
return OK;
}
break;
default:
if (resp) {
std::string errorMsg = "Incorrect output precision. Only FP32 and I32 are supported!";
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
}
return GENERAL_ERROR;
}
return OK;
}
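// Example of the removed Fill behaviour: with FILL_DIMS = {2, 3} (I32) and FILL_VALUE = 5.f
// (FP32), the 2x3 FP32 output is filled with 5.f in parallel chunks; the loop above only
// validates that the requested dims match the output tensor dims and accumulates the work amount.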
private:
const size_t FILL_DIMS = 0;
const size_t FILL_VALUE = 1;
};
REG_FACTORY_FOR(FillImpl, Fill);
} // namespace Cpu
} // namespace Extensions
} // namespace InferenceEngine

View File

@ -1,154 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "base.hpp"
#include <cmath>
#include <string>
#include <vector>
#include <cassert>
#include <algorithm>
#include <limits>
#include "ie_parallel.hpp"
#include "common/cpu_memcpy.h"
#include "common/fp16_utils.h"
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class GatherImpl: public ExtLayerBase {
public:
explicit GatherImpl(const CNNLayer* layer) {
try {
if (layer->insData.size() != 2 || layer->outData.empty())
IE_THROW() << layer->name << " Incorrect number of input/output edges!";
Precision inIdxPrecision = layer->insData[GATHER_INDEXES].lock()->getTensorDesc().getPrecision();
if (inIdxPrecision != Precision::FP32 && inIdxPrecision != Precision::I32 && inIdxPrecision != Precision::FP16)
inIdxPrecision = Precision::I32;
axis = layer->GetParamAsInt("axis");
const SizeVector& dictionary_dims = layer->insData[GATHER_DICTIONARY].lock()->getTensorDesc().getDims();
if (dictionary_dims.size() == 0)
IE_THROW() << layer->name << " Incorrect input parameters dimension!";
// Dictionary must be at least rank axis + 1
IE_ASSERT(-static_cast<int>(dictionary_dims.size()) <= axis && axis < static_cast<int>(dictionary_dims.size()))
<< layer->name << " Incorrect input parameters dimensions and axis number!";
if (axis < 0)
axis += dictionary_dims.size();
// Find number of dictionaries, index range and data length
for (int i = 0; i < axis; i++)
numDictionaries *= dictionary_dims[i];
indexRange = dictionary_dims[axis];
for (size_t i = axis + 1; i < dictionary_dims.size(); i++)
dataLength *= dictionary_dims[i];
if (dataLength == 0)
IE_THROW() << layer->name << " Incorrect input parameters dimension!";
LayerConfig config;
DataConfig dataConfigIdx, dataConfigDct;
Precision dataPrecision = layer->insData[GATHER_DICTIONARY].lock()->getTensorDesc().getPrecision();
dataConfigDct.desc = TensorDesc(dataPrecision, dictionary_dims,
layer->insData[GATHER_DICTIONARY].lock()->getTensorDesc().getLayoutByDims(dictionary_dims));
config.inConfs.push_back(dataConfigDct);
const SizeVector& indexes_dims = layer->insData[GATHER_INDEXES].lock()->getTensorDesc().getDims();
dataConfigIdx.desc = TensorDesc(inIdxPrecision, indexes_dims,
layer->insData[GATHER_INDEXES].lock()->getTensorDesc().getLayout());
config.inConfs.push_back(dataConfigIdx);
DataConfig dataConfigOut;
const SizeVector& out_dims = layer->outData[0]->getTensorDesc().getDims();
dataConfigOut.desc = TensorDesc(dataPrecision, out_dims,
layer->outData[0]->getTensorDesc().getLayoutByDims(out_dims));
config.outConfs.push_back(dataConfigOut);
config.dynBatchSupport = false;
confs.push_back(config);
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
}
}
struct f32toUi32 {
inline unsigned int operator()(const float value) {
return static_cast<unsigned int>(value);
}
};
struct f16toUi32 {
inline unsigned int operator()(const ie_fp16 value) {
return static_cast<unsigned int>(f16tof32(value));
}
};
struct i32toUi32 {
inline unsigned int operator()(const int32_t value) {
return static_cast<unsigned int>(value);
}
};
StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept override {
switch (inputs[GATHER_INDEXES]->getTensorDesc().getPrecision()) {
case Precision::FP32:
gather<float, f32toUi32>(inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0]);
break;
case Precision::FP16:
gather<ie_fp16, f16toUi32>(inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0]);
break;
case Precision::I32:
gather<int32_t, i32toUi32>(inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0]);
break;
default:
return GENERAL_ERROR;
}
return OK;
}
private:
template <typename index_t, class Conversion>
void gather(Blob::Ptr indexes, Blob::Ptr dictionary, Blob::Ptr output) {
size_t src_indexSize = indexes->size();
const index_t *src_index = indexes->cbuffer().as<const index_t *>() + indexes->getTensorDesc().getBlockingDesc().getOffsetPadding();
const uint8_t *src_dataDict = dictionary->cbuffer().as<const uint8_t *>() + dictionary->getTensorDesc().getBlockingDesc().getOffsetPadding();
uint8_t *dst_data = output->cbuffer().as<uint8_t*>() + output->getTensorDesc().getBlockingDesc().getOffsetPadding();
size_t len = dataLength * dictionary->getTensorDesc().getPrecision().size();
parallel_for(src_indexSize, [&](size_t i) {
unsigned int idx = Conversion()(src_index[i]);
// Index clipping
if (idx < indexRange) {
// Copying data to destination from Dictionary
for (size_t j = 0; j < numDictionaries; j++) {
cpu_memcpy_s(&dst_data[len * (i + j * src_indexSize)],
output->byteSize() - (len * (i + j * src_indexSize)),
&src_dataDict[len * (idx + j * indexRange)],
len);
}
} else {
for (size_t j = 0; j < numDictionaries; j++) {
memset(&dst_data[len * (i + j * src_indexSize)], 0, len);
}
}
});
}
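// Addressing used above, for illustration: a dictionary of dims {2, 4, 5} gathered along
// axis = 1 gives numDictionaries = 2, indexRange = 4, dataLength = 5, so for each index value
// idx the len = 5 * sizeof(type) bytes at dict[j][idx][*] are copied to out[j][i][*] for
// j = 0..numDictionaries-1; out-of-range indices produce zero-filled blocks rather than being
// clamped.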
int axis = 0;
size_t numDictionaries = 1;
size_t indexRange = 0;
size_t dataLength = 1;
const size_t GATHER_DICTIONARY = 0;
const size_t GATHER_INDEXES = 1;
};
REG_FACTORY_FOR(GatherImpl, Gather);
} // namespace Cpu
} // namespace Extensions
} // namespace InferenceEngine

View File

@ -1,149 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "base.hpp"
#include <string>
#include <vector>
#include "ie_parallel.hpp"
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class GatherElementsImpl: public ExtLayerBase {
public:
explicit GatherElementsImpl(const CNNLayer* layer) : strideAx1Diff_(0) {
errorPrefix_ = std::string("Layer GatherElements with name '") + layer->name + "'";
if (layer->insData.size() != 2 || layer->outData.size() != 1)
IE_THROW() << errorPrefix_ << " has invalid number of input/output edges.";
auto inputData = layer->insData[dataIndex_].lock();
auto indices = layer->insData[indicesIndex_].lock();
if (!inputData || !indices)
IE_THROW() << errorPrefix_ << " has nullable inputs.";
const auto& dataDims = inputData->getTensorDesc().getDims();
const auto& indicesDims = indices->getTensorDesc().getDims();
if (dataDims.size() != indicesDims.size())
IE_THROW() << errorPrefix_ << " has invalid input shapes. Inputs 'Data' and 'Indices' must have equal ranks.";
Precision dataPrecision = inputData->getTensorDesc().getPrecision();
if (dataPrecision.size() != sizeof(PrecisionTrait<Precision::I32>::value_type) &&
dataPrecision.size() != sizeof(PrecisionTrait<Precision::I16>::value_type) &&
dataPrecision.size() != sizeof(PrecisionTrait<Precision::I8>::value_type)) {
IE_THROW() << errorPrefix_ << " has unsupported 'inputData' input precision: " << dataPrecision;
}
Precision indicesPrecision = indices->getTensorDesc().getPrecision();
if (indicesPrecision != Precision::I32) {
IE_THROW() << errorPrefix_ << " has unsupported 'indices' input precision: " << indicesPrecision;
}
dataTypeSize_ = dataPrecision.size();
int axis = layer->GetParamAsInt("axis");
if (axis < 0)
axis += dataDims.size();
if (axis < 0 || axis >= static_cast<int>(dataDims.size()))
IE_THROW() << errorPrefix_ << " has invalid axis attribute: " << axis;
axis_ = axis;
auto& outputData = layer->outData[0];
strideAxDst_ = outputData->getTensorDesc().getBlockingDesc().getStrides()[axis_];
dstAxDim_ = outputData->getTensorDesc().getDims()[axis_];
if (axis_ > 0) {
strideAx1Diff_ = inputData->getTensorDesc().getBlockingDesc().getStrides()[axis_ - 1] -
outputData->getTensorDesc().getBlockingDesc().getStrides()[axis_ - 1];
}
LayerConfig config;
DataConfig dataConfig, indicesConfig, outConfig;
dataConfig.desc = TensorDesc(dataPrecision, dataDims,
inputData->getTensorDesc().getLayoutByDims(dataDims));
config.inConfs.push_back(dataConfig);
indicesConfig.desc = TensorDesc(Precision::I32, indicesDims,
indices->getTensorDesc().getLayoutByDims(indicesDims));
config.inConfs.push_back(indicesConfig);
const auto& outDims = outputData->getTensorDesc().getDims();
outConfig.desc = TensorDesc(dataPrecision, outDims,
outputData->getTensorDesc().getLayoutByDims(outDims));
config.outConfs.push_back(outConfig);
config.dynBatchSupport = false;
confs.push_back(config);
}
StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept override {
switch (dataTypeSize_) {
case sizeof(PrecisionTrait<Precision::I32>::value_type):
return directExecution<PrecisionTrait<Precision::I32>::value_type>(inputs, outputs, resp);
case sizeof(PrecisionTrait<Precision::I16>::value_type):
return directExecution<PrecisionTrait<Precision::I16>::value_type>(inputs, outputs, resp);
case sizeof(PrecisionTrait<Precision::I8>::value_type):
return directExecution<PrecisionTrait<Precision::I8>::value_type>(inputs, outputs, resp);
default:
std::string errMsg = errorPrefix_ + " has inputData input with unsupported precision: " +
inputs[dataIndex_]->getTensorDesc().getPrecision().name();
errMsg.copy(resp->msg, sizeof(resp->msg) - 1);
return GENERAL_ERROR;
}
}
protected:
template <typename dataType>
StatusCode directExecution(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept {
const dataType* srcData = inputs[dataIndex_]->cbuffer().as<const dataType*>() +
inputs[dataIndex_]->getTensorDesc().getBlockingDesc().getOffsetPadding();
const int* indices = inputs[indicesIndex_]->cbuffer().as<const int*>() +
inputs[indicesIndex_]->getTensorDesc().getBlockingDesc().getOffsetPadding();
dataType* dstData = outputs[0]->buffer().as<dataType*>() +
outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
const int outSize = outputs[0]->size();
auto threadBody = [&](const int ithr, const int nthr) {
int start(0lu), end(0lu);
splitter(outSize, nthr, ithr, start, end);
if (start >= end)
return;
int axStrideIt = start % strideAxDst_;
int dstAxIdx = (start / strideAxDst_) % dstAxDim_;
int dstShift0 = (start / strideAxDst_ / dstAxDim_) * strideAx1Diff_;
for (size_t o = start; o < end; o++, axStrideIt++) {
if (axStrideIt == strideAxDst_) {
axStrideIt = 0;
dstAxIdx++;
if (dstAxIdx == dstAxDim_) {
dstAxIdx = 0;
dstShift0 += strideAx1Diff_;
}
}
dstData[o] = srcData[o + dstShift0 + (indices[o] - dstAxIdx) * strideAxDst_];
}
};
parallel_nt(0, threadBody);
return OK;
}
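// The offset arithmetic above maps an output linear index o = (outer, axIdx, inner) to the
// source element (outer, indices[o], inner): the axis coordinate is swapped via
// (indices[o] - dstAxIdx) * strideAxDst_ (the inner dims match, so the axis stride is shared),
// and dstShift0 compensates once per outer index for the data/output stride difference before
// the axis (strideAx1Diff_).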
const size_t dataIndex_ = 0;
const size_t indicesIndex_ = 1;
size_t axis_;
size_t dataTypeSize_;
int strideAxDst_;
int dstAxDim_;
int strideAx1Diff_;
std::string errorPrefix_;
};
REG_FACTORY_FOR(GatherElementsImpl, GatherElements);
} // namespace Cpu
} // namespace Extensions
} // namespace InferenceEngine

View File

@ -1,230 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "base.hpp"
#include <string>
#include <vector>
#include "ie_parallel.hpp"
#include "common/cpu_memcpy.h"
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class GatherNDImpl: public ExtLayerBase {
public:
explicit GatherNDImpl(const CNNLayer* layer) {
_errorPrefix = std::string("Layer GatherND with name '") + layer->name + "'";
if (layer->insData.size() != 2 || layer->outData.size() != 1)
IE_THROW() << _errorPrefix << " has invalid number of input/output edges.";
auto data = layer->insData[_dataIndex].lock();
auto indices = layer->insData[_indicesIndex].lock();
if (!data || !indices)
IE_THROW() << _errorPrefix << " has nullable inputs.";
Precision dataPrecision = data->getTensorDesc().getPrecision();
if (dataPrecision.size() != sizeof(PrecisionTrait<Precision::I32>::value_type) &&
dataPrecision.size() != sizeof(PrecisionTrait<Precision::I16>::value_type) &&
dataPrecision.size() != sizeof(PrecisionTrait<Precision::I8>::value_type)) {
IE_THROW() << _errorPrefix << " has unsupported 'data' input precision: " << dataPrecision;
}
Precision indicesPrecision = indices->getTensorDesc().getPrecision();
if (indicesPrecision != Precision::I32 &&
indicesPrecision != Precision::I16 && indicesPrecision != Precision::U16 &&
indicesPrecision != Precision::I8 && indicesPrecision != Precision::U8) {
IE_THROW() << _errorPrefix << " has unsupported 'indices' input precision: " << indicesPrecision;
}
_dataTypeSize = dataPrecision.size();
const auto& dataDims = data->getTensorDesc().getDims();
const auto& indicesDims = indices->getTensorDesc().getDims();
_batchDims = layer->GetParamAsInt("batch_dims", 0);
if (_batchDims >= std::min(dataDims.size(), indicesDims.size()))
IE_THROW() << _errorPrefix << " has invalid batch_dims attribute: " << _batchDims;
_batchNum = 1lu;
for (size_t i = 0; i < _batchDims; i++) {
_batchNum *= indicesDims[i];
}
_sliceRank = indicesDims[indicesDims.size() - 1];
_dataRank = dataDims.size() - _batchDims;
if (_sliceRank > _dataRank)
IE_THROW() << _errorPrefix << " has invalid inputs shapes.";
_blockSize = 1;
for (size_t i = _sliceRank + _batchDims; i < dataDims.size(); i++) {
_blockSize *= dataDims[i];
}
_batchStep = 1;
for (size_t i = _batchDims; i < dataDims.size(); i++) {
_batchStep *= dataDims[i];
}
LayerConfig config;
DataConfig dataConfig, indicesConfig, outConfig;
dataConfig.desc = TensorDesc(dataPrecision, dataDims,
data->getTensorDesc().getLayoutByDims(dataDims));
config.inConfs.push_back(dataConfig);
indicesConfig.desc = TensorDesc(Precision::I32, indicesDims,
indices->getTensorDesc().getLayoutByDims(indicesDims));
config.inConfs.push_back(indicesConfig);
const auto& outDims = layer->outData[0]->getTensorDesc().getDims();
outConfig.desc = TensorDesc(dataPrecision, outDims,
layer->outData[0]->getTensorDesc().getLayoutByDims(outDims));
config.outConfs.push_back(outConfig);
config.dynBatchSupport = false;
confs.push_back(config);
}
StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept override {
if (_blockSize > 1) {
gatherBlocks(inputs, outputs, resp);
} else {
switch (_dataTypeSize) {
case sizeof(PrecisionTrait<Precision::I32>::value_type):
gatherElementwise<PrecisionTrait<Precision::I32>::value_type>(inputs, outputs, resp);
break;
case sizeof(PrecisionTrait<Precision::I16>::value_type):
gatherElementwise<PrecisionTrait<Precision::I16>::value_type>(inputs, outputs, resp);
break;
case sizeof(PrecisionTrait<Precision::I8>::value_type):
gatherElementwise<PrecisionTrait<Precision::I8>::value_type>(inputs, outputs, resp);
break;
default:
std::string errMsg = _errorPrefix + " has data input with unsupported precision: " +
inputs[_dataIndex]->getTensorDesc().getPrecision().name();
errMsg.copy(resp->msg, sizeof(resp->msg) - 1);
return GENERAL_ERROR;
}
}
return OK;
}
protected:
template <typename dataType>
void gatherElementwise(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept {
const dataType* srcData = inputs[_dataIndex]->cbuffer().as<const dataType*>() +
inputs[_dataIndex]->getTensorDesc().getBlockingDesc().getOffsetPadding();
const int* indices = inputs[_indicesIndex]->cbuffer().as<const int*>() +
inputs[_indicesIndex]->getTensorDesc().getBlockingDesc().getOffsetPadding();
dataType* dstData = outputs[0]->buffer().as<dataType*>() +
outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
const size_t* srcMultipliers = inputs[_dataIndex]->getTensorDesc().getBlockingDesc().getStrides().data() + _batchDims;
const size_t cycles = outputs[0]->byteSize() / (sizeof(dataType) * _batchNum);
const size_t CS = cycles * _sliceRank;
const size_t CB = cycles * _blockSize;
const size_t workAmount = _batchNum * cycles;
auto threadBody = [&](const int ithr, const int nthr) {
size_t start(0lu), end(0lu);
splitter(workAmount, nthr, ithr, start, end);
if (start >= end)
return;
size_t bStart = start / cycles;
size_t cStart = start % cycles;
size_t workCounter = start;
const dataType* shiftedSrcData = srcData + bStart * _batchStep;
const int* shiftedIndices = indices + bStart * CS + cStart * _sliceRank;
dataType* shiftedDstData = dstData + bStart * CB + cStart * _blockSize;
for (size_t b = bStart; b < _batchNum; b++) {
for (size_t j = cStart; j < cycles; j++) {
size_t dataIdx = 0lu;
for (size_t i = 0lu; i < _sliceRank; i++)
dataIdx += srcMultipliers[i] * shiftedIndices[i];
shiftedDstData[0] = shiftedSrcData[dataIdx];
shiftedDstData++;
shiftedIndices += _sliceRank;
if (++workCounter == end) {
return;
}
}
cStart = 0lu;
shiftedSrcData += _batchStep;
}
};
parallel_nt(0, threadBody);
}
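// dataIdx above is the dot product of one _sliceRank-long tuple from 'indices' with the data
// strides taken after the batch dims (srcMultipliers), i.e. the linear offset of the addressed
// element within the current batch. This elementwise path is used when the gathered slice is a
// single scalar (_blockSize == 1); gatherBlocks() below copies dataStep bytes per tuple for
// larger trailing slices.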
void gatherBlocks(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept {
const uint8_t* srcData = inputs[_dataIndex]->cbuffer().as<const uint8_t*>() +
inputs[_dataIndex]->getTensorDesc().getBlockingDesc().getOffsetPadding();
const int* indices = inputs[_indicesIndex]->cbuffer().as<const int*>() +
inputs[_indicesIndex]->getTensorDesc().getBlockingDesc().getOffsetPadding();
uint8_t* dstData = outputs[0]->buffer().as<uint8_t*>() +
outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
std::vector<size_t> srcMultipliers(_sliceRank);
for (size_t i = 0; i < _sliceRank ; i++)
srcMultipliers[i] = _dataTypeSize * inputs[_dataIndex]->getTensorDesc().getBlockingDesc().getStrides()[i + _batchDims];
const size_t batchStep = _batchStep * _dataTypeSize;
const size_t dataStep = _blockSize * _dataTypeSize;
const size_t cycles = outputs[0]->byteSize() / (dataStep * _batchNum);
const size_t CS = cycles * _sliceRank;
const size_t CB = cycles * dataStep;
const size_t workAmount = _batchNum * cycles;
auto threadBody = [&](const int ithr, const int nthr) {
size_t start(0lu), end(0lu);
splitter(workAmount, nthr, ithr, start, end);
if (start >= end)
return;
size_t bStart = start / cycles;
size_t cStart = start % cycles;
size_t workCounter = start;
const uint8_t* shiftedSrcData = srcData + bStart * batchStep;
const int* shiftedIndices = indices + bStart * CS + cStart * _sliceRank;
uint8_t* shiftedDstData = dstData + bStart * CB + cStart * dataStep;
for (size_t b = bStart; b < _batchNum; b++) {
for (size_t j = cStart; j < cycles; j++) {
size_t dataIdx = 0lu;
for (size_t i = 0; i < _sliceRank ; i++)
dataIdx += srcMultipliers[i] * shiftedIndices[i];
cpu_memcpy(shiftedDstData, &(shiftedSrcData[dataIdx]), dataStep);
shiftedDstData += dataStep;
shiftedIndices += _sliceRank;
if (++workCounter == end) {
return;
}
}
cStart = 0;
shiftedSrcData += batchStep;
}
};
parallel_nt(0, threadBody);
}
size_t _dataRank;
size_t _sliceRank;
size_t _blockSize;
size_t _batchDims;
size_t _batchNum;
size_t _batchStep;
size_t _dataTypeSize;
const size_t _dataIndex = 0;
const size_t _indicesIndex = 1;
std::string _errorPrefix;
};
REG_FACTORY_FOR(GatherNDImpl, GatherND);
} // namespace Cpu
} // namespace Extensions
} // namespace InferenceEngine

View File

@ -3,6 +3,9 @@
//
#include "base.hpp"
#include <ngraph/op/gather_tree.hpp>
#include <nodes/common/tensor_desc_creator.h>
#include <utils/general_utils.h>
#include <cmath>
#include <limits>
@ -17,45 +20,71 @@ namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
using MKLDNNPlugin::TensorDescCreatorTypes;
class GatherTreeImpl: public ExtLayerBase {
public:
explicit GatherTreeImpl(const CNNLayer* layer) {
static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
if (layer->insData.empty() || layer->outData.empty())
IE_THROW() << layer->name << " Incorrect number of input/output edges.";
auto gatherElementsOp = ngraph::as_type_ptr<const ngraph::op::v1::GatherTree>(op);
if (!gatherElementsOp) {
errorMessage = "Node is not an instance of the GatherTree operation from operation set v1.";
return false;
}
if (layer->insData.size() != 4)
IE_THROW() << layer->name << " Incorrect number of input edges.";
if (layer->outData.size() != 1)
IE_THROW() << layer->name << " Incorrect number of output edges.";
precision = layer->insData[GATHER_TREE_STEP_IDX].lock()->getTensorDesc().getPrecision();
if (precision != Precision::FP32 && precision != Precision::I32)
precision = Precision::FP32;
if (layer->insData[GATHER_TREE_PARENT_IDX].lock()->getTensorDesc().getPrecision() != precision ||
layer->insData[GATHER_TREE_MAX_SEQ_LEN].lock()->getTensorDesc().getPrecision() != precision ||
layer->insData[GATHER_TREE_END_TOKEN].lock()->getTensorDesc().getPrecision() != precision ||
layer->outData[0]->getTensorDesc().getPrecision() != precision)
IE_THROW() << layer->name << " Incorrect input/output data tensor precision. Should be the same.";
if (layer->insData[GATHER_TREE_STEP_IDX].lock()->getTensorDesc().getDims().size() != 3)
IE_THROW() << layer->name << " step_idx vector should be 3 dimension";
if (layer->insData[GATHER_TREE_PARENT_IDX].lock()->getTensorDesc().getDims().size() != 3)
IE_THROW() << layer->name << " parent_idx vector should be 3 dimension";
if (layer->insData[GATHER_TREE_MAX_SEQ_LEN].lock()->getTensorDesc().getDims().size() != 1)
IE_THROW() << layer->name << " max_seq_len vector should be 1 dimension";
if (layer->insData[GATHER_TREE_END_TOKEN].lock()->getTensorDesc().getDims().size() != 1)
IE_THROW() << layer->name << " end_token should be 1 dimension";
addConfig(layer, { DataConfigurator(ConfLayout::PLN, precision), DataConfigurator(ConfLayout::PLN, precision),
DataConfigurator(ConfLayout::PLN, precision), DataConfigurator(ConfLayout::PLN, precision) },
{ DataConfigurator(ConfLayout::PLN, precision) });
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
auto precision = op->get_input_element_type(GATHER_TREE_STEP_IDX);
if (!MKLDNNPlugin::one_of(precision, ngraph::element::f32, ngraph::element::i32))
precision = ngraph::element::f32;
if (op->get_input_element_type(GATHER_TREE_PARENT_IDX) != precision ||
op->get_input_element_type(GATHER_TREE_MAX_SEQ_LEN) != precision ||
op->get_input_element_type(GATHER_TREE_END_TOKEN) != precision ||
op->get_output_element_type(0) != precision) {
errorMessage = "Node has incorrect input/output data precision. Must be the same.";
return false;
}
} catch (...) {
return false;
}
return true;
}
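// The static check above is reused by the constructor below: an op that fails it is rejected
// with a NotImplemented exception rather than a hard error.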
explicit GatherTreeImpl(const std::shared_ptr<ngraph::Node>& op) {
try {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
std::string errorPrefix = std::string("Node GatherTree with name '") + op->get_friendly_name() + "'";
if (op->get_input_size() != 4)
IE_THROW() << errorPrefix << " has incorrect number of input edges.";
if (op->get_output_size() != 1)
IE_THROW() << errorPrefix << " has incorrect number of output edges.";
precision = details::convertPrecision(op->get_input_element_type(GATHER_TREE_STEP_IDX));
if (!MKLDNNPlugin::one_of(precision, Precision::FP32, Precision::I32))
precision = Precision::FP32;
if (op->get_input_shape(GATHER_TREE_STEP_IDX).size() != 3)
IE_THROW() << errorPrefix << " step_idx vector should be 3-dimensional";
if (op->get_input_shape(GATHER_TREE_PARENT_IDX).size() != 3)
IE_THROW() << errorPrefix << " parent_idx vector should be 3-dimensional";
if (op->get_input_shape(GATHER_TREE_MAX_SEQ_LEN).size() != 1)
IE_THROW() << errorPrefix << " max_seq_len vector should be 1-dimensional";
if (op->get_input_shape(GATHER_TREE_END_TOKEN).size() != 0)
IE_THROW() << errorPrefix << " end_token should be a scalar";
addConfig(op, {{TensorDescCreatorTypes::ncsp, precision},
{TensorDescCreatorTypes::ncsp, precision},
{TensorDescCreatorTypes::ncsp, precision},
{TensorDescCreatorTypes::ncsp, precision}},
{{TensorDescCreatorTypes::ncsp, precision}});
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
throw;
}
}
StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept override {
if (precision == Precision::FP32)
@ -140,10 +169,10 @@ public:
}
private:
const size_t GATHER_TREE_STEP_IDX = 0;
const size_t GATHER_TREE_PARENT_IDX = 1;
const size_t GATHER_TREE_MAX_SEQ_LEN = 2;
const size_t GATHER_TREE_END_TOKEN = 3;
static const size_t GATHER_TREE_STEP_IDX = 0;
static const size_t GATHER_TREE_PARENT_IDX = 1;
static const size_t GATHER_TREE_MAX_SEQ_LEN = 2;
static const size_t GATHER_TREE_END_TOKEN = 3;
InferenceEngine::Precision precision;
};

View File

@ -8,21 +8,48 @@
#include <string>
#include <vector>
#include "ie_parallel.hpp"
#include <ngraph/opsets/opset1.hpp>
using namespace MKLDNNPlugin;
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class GRNImpl: public ExtLayerBase {
public:
explicit GRNImpl(const CNNLayer* layer) {
bool isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
if (layer->insData.size() != 1 || layer->outData.empty())
IE_THROW() << "Incorrect number of input/output edges!";
const auto grn = std::dynamic_pointer_cast<const ngraph::opset1::GRN>(op);
if (!grn) {
errorMessage = "Only opset1 GRN operation is supported";
return false;
}
} catch (...) {
return false;
}
return true;
}
bias = layer->GetParamAsFloat("bias");
std::string errorPrefix;
addConfig(layer, {{ConfLayout::PLN, false, 0, Precision::FP32}}, {{ConfLayout::PLN, false, 0, Precision::FP32}});
public:
explicit GRNImpl(const std::shared_ptr<ngraph::Node>& op) {
try {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
errorPrefix = "GRN layer with name '" + op->get_friendly_name() + "'";
const auto grn = std::dynamic_pointer_cast<const ngraph::opset1::GRN>(op);
if (op->get_input_size() != 1 || op->get_output_size() != 1)
IE_THROW() << errorPrefix << " has incorrect number of input/output edges!";
bias = grn->get_bias();
addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32, false, 0}},
{{TensorDescCreatorTypes::ncsp, Precision::FP32, false, 0}});
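// 'bias' is the stabilizing term of GRN: y[c] = x[c] / sqrt(sum over channels of x^2 + bias),
// computed per spatial position in the execute() implementation (not shown in this hunk).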
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
}

View File

@ -7,12 +7,12 @@
#include <mkldnn_selective_build.h>
#include <ie_iextension.h>
#include <legacy/ie_layers.h>
#include <string>
#include <map>
#include <memory>
#include <algorithm>
#include <ngraph/node.hpp>
namespace InferenceEngine {
@ -43,7 +43,7 @@ public:
namespace Extensions {
namespace Cpu {
using ext_factory = std::function<InferenceEngine::ILayerImplFactory*(const InferenceEngine::CNNLayer*)>;
using ext_factory = std::function<InferenceEngine::ILayerImplFactory*(const std::shared_ptr<ngraph::Node>& op)>;
struct ExtensionsHolder {
std::map<std::string, ext_factory> list;
@ -60,11 +60,11 @@ public:
}
virtual StatusCode
getFactoryFor(ILayerImplFactory*& factory, const CNNLayer* cnnLayer, ResponseDesc* resp) noexcept {
getFactoryFor(ILayerImplFactory*& factory, const std::shared_ptr<ngraph::Node>& op, ResponseDesc* resp) noexcept {
using namespace MKLDNNPlugin;
factory = layersFactory.createNodeIfRegistered(MKLDNNPlugin, cnnLayer->type, cnnLayer);
factory = layersFactory.createNodeIfRegistered(MKLDNNPlugin, op->get_type_name(), op);
if (!factory) {
std::string errorMsg = std::string("Factory for ") + cnnLayer->type + " wasn't found!";
std::string errorMsg = std::string("Factory for ") + op->get_type_name() + " wasn't found!";
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
return NOT_FOUND;
}
@ -85,7 +85,7 @@ public:
using LayersFactory = openvino::cc::Factory<
std::string,
InferenceEngine::ILayerImplFactory*(const InferenceEngine::CNNLayer*)>;
InferenceEngine::ILayerImplFactory*(const std::shared_ptr<ngraph::Node>& op)>;
LayersFactory layersFactory;

View File

@ -7,11 +7,7 @@
# define MKLDNN_EXTENSION_NODE(__prim, __type)
#endif
MKLDNN_EXTENSION_NODE(EmbeddingBagOffsetsSumImpl, EmbeddingBagOffsetsSum);
MKLDNN_EXTENSION_NODE(EmbeddingBagPackedSumImpl, EmbeddingBagPackedSum);
MKLDNN_EXTENSION_NODE(EmbeddingSegmentsSumImpl, EmbeddingSegmentsSum);
MKLDNN_EXTENSION_NODE(CTCLossImpl, CTCLoss);
MKLDNN_EXTENSION_NODE(PriorBoxImpl, PriorBox);
MKLDNN_EXTENSION_NODE(MathImpl, Abs);
MKLDNN_EXTENSION_NODE(MathImpl, Acos);
MKLDNN_EXTENSION_NODE(MathImpl, Acosh);
@ -38,44 +34,20 @@ MKLDNN_EXTENSION_NODE(ExperimentalDetectronTopKROIsImpl, ExperimentalDetectronTo
MKLDNN_EXTENSION_NODE(ExtractImagePatchesImpl, ExtractImagePatches);
MKLDNN_EXTENSION_NODE(ReverseSequenceImpl, ReverseSequence);
MKLDNN_EXTENSION_NODE(DetectionOutputImpl, DetectionOutput);
MKLDNN_EXTENSION_NODE(ArgMaxImpl, ArgMax);
MKLDNN_EXTENSION_NODE(UnsqueezeImpl, Unsqueeze);
MKLDNN_EXTENSION_NODE(ExperimentalDetectronDetectionOutputImpl, ExperimentalDetectronDetectionOutput);
MKLDNN_EXTENSION_NODE(RegionYoloImpl, RegionYolo);
MKLDNN_EXTENSION_NODE(LogSoftmaxImpl, LogSoftmax);
MKLDNN_EXTENSION_NODE(ReorgYoloImpl, ReorgYolo);
MKLDNN_EXTENSION_NODE(SqueezeImpl, Squeeze);
MKLDNN_EXTENSION_NODE(FillImpl, Fill);
MKLDNN_EXTENSION_NODE(UniqueImpl, Unique);
MKLDNN_EXTENSION_NODE(PSROIPoolingImpl, PSROIPooling);
MKLDNN_EXTENSION_NODE(OneHotImpl, OneHot);
MKLDNN_EXTENSION_NODE(BroadcastImpl, Broadcast);
MKLDNN_EXTENSION_NODE(ExperimentalSparseWeightedReduceImpl, ExperimentalSparseWeightedSum);
MKLDNN_EXTENSION_NODE(SparseToDenseImpl, SparseToDense);
MKLDNN_EXTENSION_NODE(ExperimentalDetectronROIFeatureExtractorImpl, ExperimentalDetectronROIFeatureExtractor);
MKLDNN_EXTENSION_NODE(ONNXCustomProposalImpl, ExperimentalDetectronGenerateProposalsSingleImage);
MKLDNN_EXTENSION_NODE(NonMaxSuppressionImpl, NonMaxSuppression);
MKLDNN_EXTENSION_NODE(ExperimentalDetectronGenerateProposalsSingleImageImpl, ExperimentalDetectronGenerateProposalsSingleImage);
MKLDNN_EXTENSION_NODE(NonMaxSuppressionImpl, NonMaxSuppressionIEInternal);
MKLDNN_EXTENSION_NODE(TopKImpl, TopK);
MKLDNN_EXTENSION_NODE(ShuffleChannelsImpl, ShuffleChannels);
MKLDNN_EXTENSION_NODE(PowerFileImpl, PowerFile);
MKLDNN_EXTENSION_NODE(BatchToSpaceImpl, BatchToSpace);
MKLDNN_EXTENSION_NODE(ExperimentalDetectronPriorGridGeneratorImpl, ExperimentalDetectronPriorGridGenerator);
MKLDNN_EXTENSION_NODE(SimplerNMSImpl, SimplerNMS);
MKLDNN_EXTENSION_NODE(GRNImpl, GRN);
MKLDNN_EXTENSION_NODE(SparseFillEmptyRowsImpl, SparseFillEmptyRows);
MKLDNN_EXTENSION_NODE(BucketizeImpl, Bucketize);
MKLDNN_EXTENSION_NODE(CTCGreedyDecoderImpl, CTCGreedyDecoder);
MKLDNN_EXTENSION_NODE(CTCGreedyDecoderSeqLenImpl, CTCGreedyDecoderSeqLen);
MKLDNN_EXTENSION_NODE(GatherImpl, Gather);
MKLDNN_EXTENSION_NODE(GatherElementsImpl, GatherElements);
MKLDNN_EXTENSION_NODE(GatherNDImpl, GatherND);
MKLDNN_EXTENSION_NODE(ProposalImpl, Proposal);
MKLDNN_EXTENSION_NODE(RangeImpl, Range);
MKLDNN_EXTENSION_NODE(SelectImpl, Select);
MKLDNN_EXTENSION_NODE(GatherTreeImpl, GatherTree);
MKLDNN_EXTENSION_NODE(PriorBoxClusteredImpl, PriorBoxClustered);
MKLDNN_EXTENSION_NODE(SpaceToBatchImpl, SpaceToBatch);
MKLDNN_EXTENSION_NODE(SparseSegmentReduceImpl, SparseSegmentMean);
MKLDNN_EXTENSION_NODE(SparseSegmentReduceImpl, SparseSegmentSqrtN);
MKLDNN_EXTENSION_NODE(SparseSegmentReduceImpl, SparseSegmentSum);
MKLDNN_EXTENSION_NODE(CumSumImpl, CumSum);

View File

@ -11,30 +11,51 @@
#include <vector>
#include <cassert>
#include "ie_parallel.hpp"
#include <ngraph/opsets/opset5.hpp>
using namespace MKLDNNPlugin;
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class LogSoftmaxImpl: public ExtLayerBase {
public:
explicit LogSoftmaxImpl(const CNNLayer* layer) {
bool isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
if (layer->insData.empty() || layer->outData.empty())
IE_THROW() << layer->name << " Incorrect number of input/output edges!";
const auto logSoftMax = std::dynamic_pointer_cast<const ngraph::opset5::LogSoftmax>(op);
if (!logSoftMax) {
errorMessage = "Only opset5 LogSoftmax operation is supported";
return false;
}
} catch (...) {
return false;
}
return true;
}
if (layer->insData.size() != 1)
IE_THROW() << layer->name << " Incorrect number of input edges!";
public:
explicit LogSoftmaxImpl(const std::shared_ptr<ngraph::Node>& op) {
try {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
SizeVector dims = layer->insData[0].lock()->getTensorDesc().getDims();
errorPrefix = "LogSoftmax layer with name '" + op->get_friendly_name() + "'";
const auto logSoftMax = std::dynamic_pointer_cast<const ngraph::opset5::LogSoftmax>(op);
if (op->get_input_size() != 1 || op->get_output_size() != 1)
IE_THROW() << errorPrefix << " has incorrect number of input/output edges!";
SizeVector dims = op->get_input_shape(0);
if (!dims.size())
dims = SizeVector(1, 1);
int axis = layer->GetParamAsInt("axis", -1);
int axis = logSoftMax->get_axis();
if (axis < 0)
axis += dims.size();
if (dims.size() < static_cast<size_t>((size_t)(1) + axis))
IE_THROW() << layer->name << " Incorrect input parameters dimensions and axis number!";
IE_THROW() << errorPrefix << " has incorrect input parameters dimensions and axis number!";
int j;
for (j = dims.size() - 1; j >= 0; j--) {
@ -48,7 +69,8 @@ public:
for (size_t i = (axis + 1); i < dims.size(); i++)
reduced_axis_stride *= dims[i];
addConfig(layer, { { ConfLayout::PLN, false, 0, Precision::FP32 } }, { { ConfLayout::PLN, false, 0, Precision::FP32 } });
addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32}},
{{TensorDescCreatorTypes::ncsp, Precision::FP32}});
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
}
@ -103,6 +125,8 @@ private:
size_t reduced_axis_stride = 1;
size_t axis_step = 1;
bool is_last_dim = false;
std::string errorPrefix;
};
REG_FACTORY_FOR(LogSoftmaxImpl, LogSoftmax);

View File

@ -8,87 +8,67 @@
#include <string>
#include <vector>
#include <cassert>
#include "ie_parallel.hpp"
#include "common/tensor_desc_creator.h"
#include "utils/general_utils.h"
#include <ngraph/ops.hpp>
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class MathImpl: public ExtLayerBase {
static float error_function(float x) {
const float clip_bound = 2.86f;
// Points clip_bound and -clip_bound are extremums for this polynom
// So in order to provide better accuracy comparing to std::erf we have to clip input range
if (x > clip_bound)
return 1;
if (x < -clip_bound)
return -1;
using MKLDNNPlugin::TensorDescCreatorTypes;
// A polynomial approximation of the error function
const float erfNumerator[4] = { 90.0260162353515625f, 2232.00537109375f,
7003.3251953125f, 55592.30078125f };
const float erfDenominator[5] = { 33.56171417236328125f, 521.35797119140625f,
4594.32373046875f, 22629.0f, 49267.39453125f };
float polynom = 9.60497379302978515625f;
float x2 = x * x;
for (float c : erfNumerator) {
polynom = polynom * x2 + c;
class MathImpl: public ExtLayerBase {
public:
bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
if (initializers.find(op->get_type_info()) == initializers.end()) {
errorMessage = "Unsupported Math layer type.";
return false;
}
if (MKLDNNPlugin::one_of(op->get_type_info(),
ngraph::op::v0::HardSigmoid::type_info,
ngraph::op::v0::Selu::type_info)) {
auto firstConst = ngraph::as_type_ptr<ngraph::op::v0::Constant>(op->get_input_node_shared_ptr(1));
auto secondConst = ngraph::as_type_ptr<ngraph::op::v0::Constant>(op->get_input_node_shared_ptr(2));
if (!firstConst || !secondConst) {
errorMessage = "Constant expected as the second and third inputs.";
return false;
}
}
} catch (...) {
return false;
}
x *= polynom;
polynom = 1.0f;
for (float c : erfDenominator) {
polynom = polynom * x2 + c;
}
return x / polynom;
return true;
}
public:
explicit MathImpl(const CNNLayer* layer) {
explicit MathImpl(const std::shared_ptr<ngraph::Node>& op) :
alpha(0.f), beta(0.f), gamma(0.f) {
try {
if (layer->insData.empty() || layer->outData.empty())
IE_THROW() << layer->name << " Incorrect number of input/output edges!";
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
if (layer->insData.size() != 1)
IE_THROW() << layer->name << " Incorrect number of input edges!";
initializers[op->get_type_info()](op, *this);
if (layer->insData[0].lock()->getTensorDesc().getDims() != layer->outData[0]->getTensorDesc().getDims())
IE_THROW() << layer->name << " Incorrect number of input/output dimensions!";
alpha = layer->GetParamAsFloat("alpha", 0.0f);
beta = layer->GetParamAsFloat("beta", 0.0f);
gamma = layer->GetParamAsFloat("gamma", 0.0f);
std::string math_func = layer->type;
if (math_func == "Erf") mathFunction = Math::Erf;
else if (math_func == "Abs") mathFunction = Math::Abs;
else if (math_func == "Acos") mathFunction = Math::Acos;
else if (math_func == "Acosh") mathFunction = Math::Acosh;
else if (math_func == "Asin") mathFunction = Math::Asin;
else if (math_func == "Asinh") mathFunction = Math::Asinh;
else if (math_func == "Atan") mathFunction = Math::Atan;
else if (math_func == "Atanh") mathFunction = Math::Atanh;
else if (math_func == "Ceil") mathFunction = Math::Ceil;
else if (math_func == "Ceiling") mathFunction = Math::Ceil;
else if (math_func == "Cos") mathFunction = Math::Cos;
else if (math_func == "Cosh") mathFunction = Math::Cosh;
else if (math_func == "Floor") mathFunction = Math::Floor;
else if (math_func == "HardSigmoid") mathFunction = Math::HardSigmoid;
else if (math_func == "Log") mathFunction = Math::Log;
else if (math_func == "Neg") mathFunction = Math::Neg;
else if (math_func == "Reciprocal") mathFunction = Math::Reciprocal;
else if (math_func == "Selu") mathFunction = Math::Selu;
else if (math_func == "Sign") mathFunction = Math::Sign;
else if (math_func == "Sin") mathFunction = Math::Sin;
else if (math_func == "Sinh") mathFunction = Math::Sinh;
else if (math_func == "SoftPlus") mathFunction = Math::SoftPlus;
else if (math_func == "Softsign") mathFunction = Math::Softsign;
else if (math_func == "Tan") mathFunction = Math::Tan;
else
IE_THROW() << layer->name << " Incorrect Math layer type!";
addConfig(layer, {DataConfigurator(ConfLayout::PLN, false, 0, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, false, 0, Precision::FP32)});
if (MKLDNNPlugin::one_of(op->get_type_info(),
ngraph::op::v0::HardSigmoid::type_info,
ngraph::op::v0::Selu::type_info)) {
addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32},
{TensorDescCreatorTypes::ncsp, Precision::FP32},
{TensorDescCreatorTypes::ncsp, Precision::FP32}},
{{TensorDescCreatorTypes::ncsp, Precision::FP32}});
} else {
addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32}},
{{TensorDescCreatorTypes::ncsp, Precision::FP32}});
}
} catch (InferenceEngine::Exception &ex) {
errorMsg = ex.what();
throw;
}
}
@ -99,90 +79,85 @@ public:
float* dst_data = outputs[0]->cbuffer().as<float *>() +
outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
switch (mathFunction) {
case Math::Erf:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = error_function(src_data[i]);
});
break;
case Math::Abs:
switch (getAlgorithm()) {
case MKLDNNPlugin::MathAbs:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = (std::abs)(src_data[i]);
});
break;
case Math::Acos:
case MKLDNNPlugin::MathAcos:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = acosf(src_data[i]);
});
break;
case Math::Acosh:
case MKLDNNPlugin::MathAcosh:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = acoshf(src_data[i]);
});
break;
case Math::Asin:
case MKLDNNPlugin::MathAsin:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = asinf(src_data[i]);
});
break;
case Math::Asinh:
case MKLDNNPlugin::MathAsinh:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = asinhf(src_data[i]);
});
break;
case Math::Atan:
case MKLDNNPlugin::MathAtan:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = atanf(src_data[i]);
});
break;
case Math::Atanh:
case MKLDNNPlugin::MathAtanh:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = atanhf(src_data[i]);
});
break;
case Math::Ceil:
case MKLDNNPlugin::MathCeiling:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = ceilf(src_data[i]);
});
break;
case Math::Cos:
case MKLDNNPlugin::MathCos:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = cosf(src_data[i]);
});
break;
case Math::Cosh:
case MKLDNNPlugin::MathCosh:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = coshf(src_data[i]);
});
break;
case Math::Floor:
case MKLDNNPlugin::MathFloor:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = floorf(src_data[i]);
});
break;
case Math::HardSigmoid:
case MKLDNNPlugin::MathHardSigmoid:
alpha = (alpha == 0.0f) ? 0.2f : alpha;
beta = (beta == 0.0f) ? 0.5f : beta;
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = (std::max)(0.f, (std::min)(1.f, alpha * src_data[i] + beta));
});
break;
case Math::Log:
case MKLDNNPlugin::MathLog:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = logf(src_data[i]);
});
break;
case Math::Neg:
case MKLDNNPlugin::MathNegative:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = -src_data[i];
});
break;
case Math::Reciprocal:
case MKLDNNPlugin::MathReciprocal:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = 1.0f / src_data[i];
});
break;
case Math::Selu:
case MKLDNNPlugin::MathSelu:
alpha = (alpha == 0.0f) ? 1.67326f : alpha;
gamma = (gamma == 0.0f) ? 1.0507f : gamma;
parallel_for(dataSize, [&](size_t i) {
@ -190,7 +165,7 @@ public:
dst_data[i] = (x > 0.0f) ? (gamma * x) : (gamma * alpha * (exp(x) - 1.0f));
});
break;
case Math::Sign:
case MKLDNNPlugin::MathSign:
parallel_for(dataSize, [&](size_t i) {
if (src_data[i] > 0.0f)
dst_data[i] = 1.0f;
@ -200,28 +175,28 @@ public:
dst_data[i] = 0.0f;
});
break;
case Math::Sin:
case MKLDNNPlugin::MathSin:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = sinf(src_data[i]);
});
break;
case Math::Sinh:
case MKLDNNPlugin::MathSinh:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = sinhf(src_data[i]);
});
break;
case Math::SoftPlus:
case MKLDNNPlugin::MathSoftPlus:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = logf(expf(src_data[i]) + 1);
});
break;
case Math::Softsign:
case MKLDNNPlugin::MathSoftsign:
parallel_for(dataSize, [&](size_t i) {
float x = src_data[i];
dst_data[i] = x / (1.f + (std::abs)(x));
});
break;
case Math::Tan:
case MKLDNNPlugin::MathTan:
parallel_for(dataSize, [&](size_t i) {
dst_data[i] = tanf(src_data[i]);
});
@ -237,38 +212,80 @@ public:
}
private:
enum class Math {
Abs,
Acos,
Acosh,
Asin,
Asinh,
Atan,
Atanh,
Ceil,
Cos,
Cosh,
Erf,
Floor,
HardSigmoid,
Log,
Neg,
Reciprocal,
Selu,
Sign,
Sin,
Sinh,
SoftPlus,
Softsign,
Tan
};
static std::map<const ngraph::DiscreteTypeInfo, std::function<void(const std::shared_ptr<ngraph::Node>&, MathImpl& node)>> initializers;
Math mathFunction = Math::Erf;
float alpha = 0.0f;
float beta = 0.0f;
float gamma = 0.0f;
};
std::map<const ngraph::DiscreteTypeInfo, std::function<void(const std::shared_ptr<ngraph::Node>&, MathImpl& node)>> MathImpl::initializers = {
{ngraph::op::v0::Abs::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathAbs;
}},
{ngraph::op::v0::Acos::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathAcos;
}},
{ngraph::op::v3::Acosh::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathAcosh;
}},
{ngraph::op::v0::Asin::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathAsin;
}},
{ngraph::op::v3::Asinh::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathAsinh;
}},
{ngraph::op::v0::Atan::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathAtan;
}},
{ngraph::op::v0::Ceiling::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathCeiling;
}},
{ngraph::op::v0::Cos::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathCos;
}},
{ngraph::op::v0::Cosh::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathCosh;
}},
{ngraph::op::v0::Floor::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathFloor;
}},
{ngraph::op::v0::HardSigmoid::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathHardSigmoid;
node.alpha = ngraph::as_type_ptr<ngraph::op::v0::Constant>(op->get_input_node_shared_ptr(1))->cast_vector<float>()[0];
node.beta = ngraph::as_type_ptr<ngraph::op::v0::Constant>(op->get_input_node_shared_ptr(2))->cast_vector<float>()[0];
}},
{ngraph::op::v0::Log::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathLog;
}},
{ngraph::op::v0::Negative::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathNegative;
}},
{ngraph::op::v0::Selu::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathSelu;
node.alpha = ngraph::as_type_ptr<ngraph::op::v0::Constant>(op->get_input_node_shared_ptr(1))->cast_vector<float>()[0];
node.gamma = ngraph::as_type_ptr<ngraph::op::v0::Constant>(op->get_input_node_shared_ptr(2))->cast_vector<float>()[0];
}},
{ngraph::op::v0::Sign::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathSign;
}},
{ngraph::op::v0::Sin::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathSin;
}},
{ngraph::op::v0::Sinh::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathSinh;
}},
{ngraph::op::v4::SoftPlus::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathSoftPlus;
}},
{ngraph::op::v0::Tan::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathTan;
}},
{ngraph::op::v3::Atanh::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
node.algorithm = MKLDNNPlugin::MathAtanh;
}}
};
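// Supporting another unary op only needs one more entry of the same shape (hypothetical sketch,
// not part of this change):
//     {ngraph::op::vN::SomeOp::type_info, [](const std::shared_ptr<ngraph::Node>& op, MathImpl& node) {
//         node.algorithm = MKLDNNPlugin::MathSomeOp;
//     }},
// plus the matching case in execute() and a REG_FACTORY_FOR registration below.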
REG_FACTORY_FOR(MathImpl, Abs);
REG_FACTORY_FOR(MathImpl, Acos);
REG_FACTORY_FOR(MathImpl, Acosh);
@ -280,7 +297,6 @@ REG_FACTORY_FOR(MathImpl, Ceil);
REG_FACTORY_FOR(MathImpl, Ceiling);
REG_FACTORY_FOR(MathImpl, Cos);
REG_FACTORY_FOR(MathImpl, Cosh);
REG_FACTORY_FOR(MathImpl, Erf);
REG_FACTORY_FOR(MathImpl, Floor);
REG_FACTORY_FOR(MathImpl, HardSigmoid);
REG_FACTORY_FOR(MathImpl, Log);

View File

@ -0,0 +1,237 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <cmath>
#include <vector>
#include <string>
#include <mkldnn_types.h>
#include "ie_parallel.hpp"
#include "utils/bfloat16.hpp"
#include <mkldnn_selective_build.h>
#include "mkldnn_batch_to_space_node.h"
#include <nodes/common/tensor_desc_creator.h>
#include <ngraph/opsets/opset2.hpp>
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
bool MKLDNNBatchToSpaceNode::isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
const auto batchToSpace = std::dynamic_pointer_cast<const ngraph::opset2::BatchToSpace>(op);
if (!batchToSpace) {
errorMessage = "Only opset2 BatchToSpace operation is supported";
return false;
}
if (std::dynamic_pointer_cast<const ngraph::opset1::Constant>(op->get_input_node_shared_ptr(1)) == nullptr ||
std::dynamic_pointer_cast<const ngraph::opset1::Constant>(op->get_input_node_shared_ptr(2)) == nullptr ||
std::dynamic_pointer_cast<const ngraph::opset1::Constant>(op->get_input_node_shared_ptr(3)) == nullptr) {
errorMessage = "Only constant 'block_shape', 'crops_begin', 'crops_end' are supported";
return false;
}
} catch (...) {
return false;
}
return true;
}
MKLDNNBatchToSpaceNode::MKLDNNBatchToSpaceNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng,
MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
errorPrefix = "BatchToSpace layer with name '" + op->get_friendly_name() + "'";
if (op->get_input_size() != 4 || op->get_output_size() != 1)
IE_THROW() << errorPrefix << " has incorrect number of input or output edges!";
inDims = op->get_input_shape(0);
outDims = op->get_output_shape(0);
if (inDims.size() < 4 || inDims.size() > 5)
IE_THROW() << errorPrefix << " has unsupported 'data' input rank: " << inDims.size();
if (inDims.size() != outDims.size())
IE_THROW() << errorPrefix << " has incorrect number of input/output dimensions";
blockShapeIn = std::dynamic_pointer_cast<const ngraph::opset1::Constant>(op->get_input_node_shared_ptr(1))->cast_vector<size_t>();
cropsBeginIn = std::dynamic_pointer_cast<const ngraph::opset1::Constant>(op->get_input_node_shared_ptr(2))->cast_vector<size_t>();
}
void MKLDNNBatchToSpaceNode::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;
const auto precision = getOriginalInputPrecisionAtPort(0);
const std::set<size_t> supported_precision_sizes = {1, 2, 4, 8};
if (supported_precision_sizes.find(precision.size()) == supported_precision_sizes.end())
IE_THROW() << errorPrefix << " has unsupported precision: " << precision.name();
addSupportedPrimDesc({{TensorDescCreatorTypes::nspc, precision},
{TensorDescCreatorTypes::ncsp},
{TensorDescCreatorTypes::ncsp},
{TensorDescCreatorTypes::ncsp}},
{{TensorDescCreatorTypes::nspc, precision}},
impl_desc_type::ref_any);
addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, precision},
{TensorDescCreatorTypes::ncsp},
{TensorDescCreatorTypes::ncsp},
{TensorDescCreatorTypes::ncsp}},
{{TensorDescCreatorTypes::ncsp, precision}},
impl_desc_type::ref_any);
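// Blocked layouts are registered only when the channel count is a multiple of the block size.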
if (inDims[1] % 8 == 0) {
addSupportedPrimDesc({{TensorDescCreatorTypes::nCsp8c, precision},
{TensorDescCreatorTypes::ncsp},
{TensorDescCreatorTypes::ncsp},
{TensorDescCreatorTypes::ncsp}},
{{TensorDescCreatorTypes::nCsp8c, precision}},
impl_desc_type::ref_any);
}
if (inDims[1] % 16 == 0) {
addSupportedPrimDesc({{TensorDescCreatorTypes::nCsp16c, precision},
{TensorDescCreatorTypes::ncsp},
{TensorDescCreatorTypes::ncsp},
{TensorDescCreatorTypes::ncsp}},
{{TensorDescCreatorTypes::nCsp16c, precision}},
impl_desc_type::ref_any);
}
}
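// Normalizes a 4D/5D shape to a canonical 5D form; for 4D inputs the depth dimension is padded with 1.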
static std::vector<size_t> getShape5D(const SizeVector &shape) {
std::vector<size_t> shape5D(5, 1);
for (int i = 0; i < 2; i++) {
shape5D[i] = shape[i];
shape5D[4 - i] = shape[shape.size() - 1 - i];
}
shape5D[2] = shape.size() == 5 ? shape[2] : shape5D[2];
return shape5D;
}
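// Reference BatchToSpace kernel: each input batch index is decomposed into the output batch index and
// per-dimension block offsets (crops_begin already subtracted), and elements are copied to the matching
// output positions; the work is split across threads over the batch and channel-block dimensions.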
template<typename T>
void MKLDNNBatchToSpaceNode::batchToSpaceKernel() {
const auto *srcData = reinterpret_cast<const T *>(getParentEdgeAt(0)->getMemoryPtr()->GetPtr());
auto *dstData = reinterpret_cast<T *>(getChildEdgeAt(0)->getMemoryPtr()->GetPtr());
const auto layout = getParentEdgeAt(0)->getDesc().getLayout();
const bool blocked = layout != NCHW && layout != NCDHW && layout != NHWC && layout != NDHWC;
const auto dimsSize = inDims.size();
auto inShape5D = getShape5D(inDims);
auto outShape5D = getShape5D(outDims);
auto blockShape = getShape5D(blockShapeIn);
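// For channels-last layouts the channel dimension is moved to the end of the 5D shapes
// so that the indexing below follows the nspc order.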
if (layout == NHWC || layout == NDHWC) {
inShape5D.push_back(inShape5D[1]);
inShape5D.erase(inShape5D.begin() + 1);
outShape5D.push_back(outShape5D[1]);
outShape5D.erase(outShape5D.begin() + 1);
blockShape.push_back(blockShape[1]);
blockShape.erase(blockShape.begin() + 1);
}
const size_t blockSize = blocked ? getChildEdgeAt(0)->getDesc().getBlockingDesc().getBlockDims().back() : 1lu;
const size_t blockCountInput = getParentEdgeAt(0)->getDesc().getBlockingDesc().getBlockDims()[1];
const size_t blockCountOutput = getChildEdgeAt(0)->getDesc().getBlockingDesc().getBlockDims()[1];
const auto blockRemainder = inShape5D[1] % blockSize;
const auto lastBlock = blockRemainder == 0 ? blockSize : blockRemainder;
const size_t inSpatialStep = inShape5D[2] * inShape5D[3] * inShape5D[4];
const size_t inBatchStep = (blocked ? blockSize * blockCountInput : inShape5D[1]) * inSpatialStep;
const size_t outSpatialStep = outShape5D[2] * outShape5D[3] * outShape5D[4];
const size_t outBatchStep = (blocked ? blockSize * blockCountOutput : outShape5D[1]) * outSpatialStep;
size_t channels = (inShape5D[1] / blockSize);
channels = channels == 0 ? 1 : channels;
const size_t workAmount = inShape5D[0] * channels;
parallel_nt(0, [&](const int ithr, const int nthr) {
size_t start(0lu), end(0lu);
splitter(workAmount, nthr, ithr, start, end);
std::vector<size_t> indxStart(2, 0);
std::vector<size_t> indxEnd(2, 0);
parallel_it_init(start, indxStart[0], inShape5D[0], indxStart[1], channels);
parallel_it_init((end - 1), indxEnd[0], inShape5D[0], indxEnd[1], channels);
std::vector<int64_t> oAdd(5, 1);
std::vector<size_t> begin(5, 0);
std::vector<size_t> finish(5, 1);
for (size_t i0 = indxStart[0]; i0 < indxEnd[0] + 1; ++i0) {
int64_t bIdx = i0 / outShape5D[0];
const size_t srcIdx0 = i0 * inBatchStep;
const size_t dstIdx0 = (i0 - (bIdx * outShape5D[0])) * outBatchStep;
oAdd[4] = bIdx % blockShapeIn[dimsSize - 1] - cropsBeginIn[dimsSize - 1];
bIdx /= blockShapeIn[dimsSize - 1];
oAdd[3] = bIdx % blockShapeIn[dimsSize - 2] - cropsBeginIn[dimsSize - 2];
bIdx /= blockShapeIn[dimsSize - 2];
oAdd[2] = dimsSize == 5 ? bIdx % blockShapeIn[2] - cropsBeginIn[2] : 0lu;
bIdx = dimsSize == 5 ? bIdx / blockShapeIn[2] : bIdx;
oAdd[1] = bIdx % blockShapeIn[1] - cropsBeginIn[1];
if (layout == NHWC || layout == NDHWC) {
oAdd.push_back(oAdd[1]);
oAdd.erase(oAdd.begin() + 1);
}
begin[1] = (blockShape[1] - 1 - oAdd[1]) / blockShape[1] / blockSize;
finish[1] = (outShape5D[1] - 1 - oAdd[1]) / blockShape[1] / blockSize;
begin[2] = (blockShape[2] - 1 - oAdd[2]) / blockShape[2];
finish[2] = (outShape5D[2] - 1 - oAdd[2]) / blockShape[2];
begin[3] = (blockShape[3] - 1 - oAdd[3]) / blockShape[3];
finish[3] = (outShape5D[3] - 1 - oAdd[3]) / blockShape[3];
begin[4] = (blockShape[4] - 1 - oAdd[4]) / blockShape[4];
finish[4] = (outShape5D[4] - 1 - oAdd[4]) / blockShape[4];
const int64_t addTmpOC = blocked ? 0lu : oAdd[1];
const int64_t addTmpOc = blocked ? oAdd[1] : 0lu;
indxStart[1] = begin[1] > indxStart[1] ? begin[1] : indxStart[1];
const size_t lastI1 = i0 == indxEnd[0] ? (indxEnd[1] > finish[1] ? finish[1] : indxEnd[1]) : finish[1];
for (; indxStart[1] < lastI1 + 1; ++indxStart[1]) {
const size_t block = indxStart[1] == finish[1] ? lastBlock : blockSize;
const int64_t tmpOC = indxStart[1] * blockShape[1] + addTmpOC;
const size_t srcIdx1 = srcIdx0 + indxStart[1] * inSpatialStep * blockSize;
const size_t dstIdx1 = dstIdx0 + tmpOC * outSpatialStep * blockSize;
const size_t itEnd = blocked ? ((block - 1) * blockShape[1] + oAdd[1]) / blockSize : 0lu;
for (size_t i2 = begin[2]; i2 < finish[2] + 1; ++i2) {
const int64_t tmpOd = i2 * blockShape[2] + oAdd[2];
const size_t srcIdx2 = srcIdx1 + i2 * inShape5D[3] * inShape5D[4] * blockSize;
const size_t dstIdx2 = dstIdx1 + tmpOd * outShape5D[3] * outShape5D[4] * blockSize;
for (size_t i3 = begin[3]; i3 < finish[3] + 1; ++i3) {
const int64_t tmpOh = i3 * blockShape[3] + oAdd[3];
const size_t srcIdx3 = srcIdx2 + i3 * inShape5D[4] * blockSize;
const size_t dstIdx3 = dstIdx2 + tmpOh * outShape5D[4] * blockSize;
for (size_t i4 = begin[4]; i4 < finish[4] + 1; ++i4) {
const int64_t tmpOw = i4 * blockShape[4] + oAdd[4];
const size_t srcIdx4 = srcIdx3 + i4 * blockSize;
const size_t dstIdx4 = dstIdx3 + tmpOw * blockSize;
for (size_t it = 0; it < itEnd + 1; ++it) {
const size_t i5Begin = it == 0 ? 0 : (it * blockSize - 1 - oAdd[1]) / blockShape[1] + 1;
const size_t i5End = it == itEnd ? (block - 1) : ((it + 1) * blockSize - 1 - oAdd[1]) / blockShape[1];
for (size_t i5 = i5Begin; i5 < i5End + 1; ++i5) {
const int64_t tmpOc = i5 * blockShape[1] + addTmpOc;
const size_t srcIdx5 = srcIdx4 + i5;
const size_t dstIdx5 =
dstIdx4 + it * outSpatialStep * blockSize + (tmpOc - it * blockSize);
dstData[dstIdx5] = srcData[srcIdx5];
}
}
}
}
}
}
indxStart[1] = 0lu;
}
});
}
void MKLDNNBatchToSpaceNode::execute(mkldnn::stream strm) {
switch (getParentEdgeAt(0)->getDesc().getPrecision().size()) {
case 1: batchToSpaceKernel<PrecisionTrait<Precision::U8>::value_type>(); break;
case 2: batchToSpaceKernel<PrecisionTrait<Precision::U16>::value_type>(); break;
case 4: batchToSpaceKernel<PrecisionTrait<Precision::I32>::value_type>(); break;
default:
IE_THROW() << "BatchToSpace layer does not support precision '" + std::string(getParentEdgeAt(0)->getDesc().getPrecision().name()) + "'";
}
}
bool MKLDNNBatchToSpaceNode::created() const {
return getType() == BatchToSpace;
}
REG_MKLDNN_PRIM_FOR(MKLDNNBatchToSpaceNode, BatchToSpace)

View File

@ -0,0 +1,40 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ie_common.h>
#include <mkldnn_node.h>
#include <string>
#include <memory>
#include <vector>
namespace MKLDNNPlugin {
class MKLDNNBatchToSpaceNode : public MKLDNNNode {
public:
MKLDNNBatchToSpaceNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNBatchToSpaceNode() override = default;
void getSupportedDescriptors() override {};
void initSupportedPrimitiveDescriptors() override;
void createPrimitive() override {};
void execute(mkldnn::stream strm) override;
bool created() const override;
static bool isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept;
private:
InferenceEngine::SizeVector inDims;
InferenceEngine::SizeVector outDims;
std::vector<size_t> blockShapeIn;
std::vector<size_t> cropsBeginIn;
std::string errorPrefix;
template<typename T>
void batchToSpaceKernel();
};
} // namespace MKLDNNPlugin

View File

@ -1,281 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "mkldnn_batchnorm_node.h"
#include <mkldnn_extension_utils.h>
#include "common/cpu_memcpy.h"
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
MKLDNNBatchNormalizationNode::MKLDNNBatchNormalizationNode(const InferenceEngine::CNNLayerPtr& layer,
const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode(layer, eng, cache) {
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
return GetVarianceDesc(primitive_desc_it);
});
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
return GetMeanDesc(primitive_desc_it);
});
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
if (!fusedWithScale())
return MKLDNNMemoryDesc();
return GetScaleShiftWeightsDesc(primitive_desc_it);
});
}
void MKLDNNBatchNormalizationNode::getSupportedDescriptors() {
if (!descs.empty())
return;
auto * bnLayer = dynamic_cast<BatchNormalizationLayer*>(getCnnLayer().get());
if (bnLayer == nullptr)
IE_THROW() << "Cannot convert batch normalization layer.";
if (bnLayer->_weights == nullptr || bnLayer->_biases == nullptr) {
IE_THROW() << "Weights/biases are empty for layer: " << bnLayer->name
<< " used in MKLDNN node: " << getName() << "\n"
<< "Use the second argumemt of InferenceEngine::Core::ReadNetwork"
<< " to load them from .bin part of the IR";
}
if (getParentEdges().size() != 1)
IE_THROW() << "Incorrect number of input edges for layer " << getName();
if (!getChildEdges().size())
IE_THROW() << "Incorrect number of output edges for layer " << getName();
eps = bnLayer->epsilon;
size_t variancesSize = MKLDNNDims(bnLayer->_weights->getTensorDesc().getDims()).size();
size_t meansSize = MKLDNNDims(bnLayer->_biases->getTensorDesc().getDims()).size();
if (variancesSize != meansSize && variancesSize != 1)
IE_THROW() << "Incorrect weights and biases sizes!";
internalBlobs.push_back(createInternalBlob(bnLayer->_weights->getTensorDesc().getDims(), true));
internalBlobs.push_back(createInternalBlob(bnLayer->_biases->getTensorDesc().getDims(), false));
auto parentOutDims = getParentEdgeAt(0)->getDims();
if (fusedWith.size() > 1)
IE_THROW() << "BatchNorm fusion is possible with only one layer!";
for (const auto &node : fusedWith) {
auto * scshLayer = dynamic_cast<ScaleShiftLayer*>(node->getCnnLayer().get());
if (scshLayer == nullptr)
IE_THROW() << "Cannot cast to the ScaleShift layer to fuse with BatchNorm.";
size_t C = static_cast<size_t>(getChildEdgeAt(0)->getDims()[1]);
SizeVector mkldnn_weights = {2, C};
TensorDesc desc(scshLayer->_weights->getTensorDesc().getPrecision(), mkldnn_weights, InferenceEngine::NC);
InferenceEngine::TBlob<float>::Ptr internalBlob = InferenceEngine::make_shared_blob<float>(desc);
internalBlob->allocate();
float * data = internalBlob->buffer();
if (data == nullptr)
IE_THROW() << "Cannot get memory!";
InferenceEngine::Blob::Ptr blb = scshLayer->_weights;
if (blb == nullptr)
IE_THROW() << "Cannot get weights blob for node " << getName() << ".";
size_t weightsByteSize = blb->byteSize();
cpu_memcpy_s(data, internalBlob->byteSize(), blb->buffer(), weightsByteSize);
data += blb->size();
blb = scshLayer->_biases;
if (blb == nullptr) {
memset(data, 0, weightsByteSize);
} else {
if (weightsByteSize != blb->byteSize())
IE_THROW() << "ScaleShift has incorrect weights!";
cpu_memcpy_s(data, internalBlob->byteSize(), blb->buffer(), weightsByteSize);
}
internalBlobs.push_back(internalBlob);
}
InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
if (precision != InferenceEngine::Precision::FP32)
precision = InferenceEngine::Precision::FP32;
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
for (auto format : getAvailableFormatsForDims(parentOutDims)) {
MKLDNNMemoryDesc in_candidate(parentOutDims, inputDataType, format);
createDescriptor({in_candidate}, {});
}
}
static MKLDNNMemoryDesc get_bn_mdesc_by_index(const mkldnn::primitive_desc_iterator &primitive_desc, int idx) {
mkldnn_batch_normalization_desc_t *p;
error::wrap_c_api(mkldnn_primitive_desc_query(
primitive_desc.get(), mkldnn::convert_to_c(mkldnn::query::batch_normalization_d), 0, &p),
"could not get a batch-normalization descriptor");
auto bndesc =
(p->flags & mkldnn::convert_to_c(mkldnn::normalization_flags::use_global_stats)) ?
primitive_desc.src_desc(idx) : primitive_desc.dst_desc(idx);
return MKLDNNMemoryDesc {bndesc};
}
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::GetVarianceDesc(const mkldnn::primitive_desc &primitive_desc) const {
// TODO: rewrite using stat_desc
return get_bn_mdesc_by_index(primitive_desc, 2);
}
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::GetMeanDesc(const mkldnn::primitive_desc &primitive_desc) const {
return get_bn_mdesc_by_index(primitive_desc, 1);
}
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::GetScaleShiftWeightsDesc(const mkldnn::primitive_desc &primitive_desc) const {
return MKLDNNMemoryDesc(primitive_desc.weights_desc(0));
}
bool MKLDNNBatchNormalizationNode::created() const {
return getType() == BatchNormalization;
}
void MKLDNNBatchNormalizationNode::createPrimitive() {
if (prim)
return;
auto prim_desc = createPrimitiveDescriptor<batch_normalization_forward::primitive_desc,
batch_normalization_forward::desc>();
prim.reset(new batch_normalization_forward(prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
const auto &mean = internalBlobMemory[1]->GetPrimitive();
const auto &var = internalBlobMemory[0]->GetPrimitive();
if (convert_to_c(flag) & dnnl_use_scaleshift) {
const auto &sclshft = internalBlobMemory[2]->GetPrimitive();
primArgs = {{DNNL_ARG_SRC, src},
{DNNL_ARG_MEAN, mean},
{DNNL_ARG_VARIANCE, var},
{DNNL_ARG_SCALE_SHIFT, sclshft},
{DNNL_ARG_DST, dst}};
} else {
primArgs = {{DNNL_ARG_SRC, src},
{DNNL_ARG_MEAN, mean},
{DNNL_ARG_VARIANCE, var},
{DNNL_ARG_DST, dst}};
}
}
void MKLDNNBatchNormalizationNode::createDescriptor(const std::vector<InferenceEngine::TensorDesc> &inputDesc,
const std::vector<InferenceEngine::TensorDesc> &outputDesc) {
MKLDNNMemoryDesc inDesc(inputDesc[0]);
if (inDesc.getDims().ndims() == 2) {
// Make it 4D
MKLDNNDims dims = inDesc.getDims();
dims.push_back(1); // H
dims.push_back(1); // W
auto format = memory::format_tag::nchw;
inDesc = MKLDNNMemoryDesc(dims, inDesc.getDataType(), format);
}
flag = normalization_flags::use_global_stats;
if (fusedWithScale())
flag |= normalization_flags::use_scale_shift;
MKLDNNDescriptor desc(std::shared_ptr<batch_normalization_forward::desc>(
new mkldnn::batch_normalization_forward::desc(prop_kind::forward_scoring, inDesc, eps,
flag)));
descs.push_back(desc);
}
void MKLDNNBatchNormalizationNode::initOptimalPrimitiveDescriptor() {
auto selected_pd = getSelectedPrimitiveDescriptor();
if (selected_pd == nullptr)
IE_THROW() << "Preferable primitive descriptor is not set.";
auto config = selected_pd->getConfig();
if (isInitConfig(config))
return;
if (config.inConfs.size() != 1 || config.outConfs.size() != 1 || (!isUninitTensorDesc(config.inConfs[0].desc) &&
!isUninitTensorDesc(config.outConfs[0].desc) && config.inConfs[0].desc != config.outConfs[0].desc))
IE_THROW() << "Layer " << getName() << " has incorrect selected config!";
if (!isUninitTensorDesc(config.inConfs[0].desc)) {
config.outConfs[0].desc = config.inConfs[0].desc;
} else if (!isUninitTensorDesc(config.outConfs[0].desc)) {
config.inConfs[0].desc = config.outConfs[0].desc;
} else {
config.outConfs[0].desc = config.inConfs[0].desc = getConfiguredInputDesc(config, 0);
}
initDescriptor(config);
}
void MKLDNNBatchNormalizationNode::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;
// BN primitive doesn't support strides
for (auto& desc : descs) {
primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(getEngine());
while (static_cast<bool>(itpd)) {
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
for (size_t i = 0; i < desc.inputNumbers(); i++) {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
dataConfig.desc = getSrcMemDesc(itpd, i);
config.inConfs.push_back(dataConfig);
}
for (size_t i = 0; i < desc.outputNumbers(); i++) {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = canBeInPlace() ? 0 : -1;
dataConfig.constant = false;
dataConfig.desc = getDstMemDesc(itpd, i);
config.outConfs.push_back(dataConfig);
}
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
supportedPrimitiveDescriptors.emplace_back(config, impl_type);
if (!itpd.next_impl())
break;
}
}
}
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it,
size_t idx) {
TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.src_desc(idx));
if (getParentEdgeAt(0)->getDims().ndims() == 2 && desc.getLayout() == InferenceEngine::Layout::NCHW) {
desc.reshape(getParentEdgeAt(idx)->getDims().ToSizeVector(), InferenceEngine::Layout::NC);
return MKLDNNMemoryDesc(desc);
}
if (desc.getLayout() == InferenceEngine::Layout::ANY)
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getParentEdgeAt(idx)->getDims().ToSizeVector(),
desc.getLayout()));
else
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getParentEdgeAt(idx)->getDims().ToSizeVector(),
desc.getBlockingDesc()));
}
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it,
size_t idx) {
TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.dst_desc(idx));
if (getParentEdgeAt(0)->getDims().ndims() == 2 && desc.getLayout() == InferenceEngine::Layout::NCHW) {
desc.reshape(getParentEdgeAt(idx)->getDims().ToSizeVector(), InferenceEngine::Layout::NC);
return MKLDNNMemoryDesc(desc);
}
if (desc.getLayout() == InferenceEngine::Layout::ANY)
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getChildEdgeAt(idx)->getDims().ToSizeVector(),
desc.getLayout()));
else
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getChildEdgeAt(idx)->getDims().ToSizeVector(),
desc.getBlockingDesc()));
}
REG_MKLDNN_PRIM_FOR(MKLDNNBatchNormalizationNode, BatchNormalization);

View File

@ -1,44 +0,0 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ie_common.h>
#include <mkldnn_node.h>
#include <memory>
#include <string>
#include <vector>
namespace MKLDNNPlugin {
class MKLDNNBatchNormalizationNode : public MKLDNNNode {
public:
MKLDNNBatchNormalizationNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng,
MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNBatchNormalizationNode() override = default;
void initSupportedPrimitiveDescriptors() override;
void initOptimalPrimitiveDescriptor() override;
void getSupportedDescriptors() override;
void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
void createPrimitive() override;
bool created() const override;
bool fusedWithScale() const {return fusedWith.size() == 1 && fusedWith[0]->getType() == Eltwise
&& fusedWith[0]->getCnnLayer()->type == "ScaleShift";}
MKLDNNMemoryDesc getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
MKLDNNMemoryDesc getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
private:
mkldnn::normalization_flags flag = mkldnn::normalization_flags::none;
float eps = 0.0f;
MKLDNNMemoryDesc GetVarianceDesc(const mkldnn::primitive_desc& primitive_desc) const;
MKLDNNMemoryDesc GetMeanDesc(const mkldnn::primitive_desc& primitive_desc) const;
MKLDNNMemoryDesc GetScaleShiftWeightsDesc(const mkldnn::primitive_desc& primitive_desc) const;
};
} // namespace MKLDNNPlugin

View File

@ -6,20 +6,19 @@
#include "mkldnn_reorder_node.h"
#include "mkldnn_input_node.h"
#include "mkldnn_eltwise_node.h"
#include "mkldnn_quantize_node.h"
#include "mkldnn_fake_quantize_node.h"
#include "mkldnn_conv_node.h"
#include <legacy/ie_layers.h>
#include <string>
#include <vector>
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
#include <legacy/ie_layers_internal.hpp>
#include "ie_parallel.hpp"
#include "cpu/x64/jit_generator.hpp"
#include "cpu/x64/jit_uni_eltwise_injector.hpp"
#include "cpu/x64/jit_uni_depthwise_injector.hpp"
#include "cpu/x64/cpu_isa_traits.hpp"
#include "utils/general_utils.h"
#include <ngraph/opsets/opset1.hpp>
// WA for xbyak.h
#ifdef _WIN32
@ -873,17 +872,52 @@ private:
}
};
MKLDNNBinaryConvolutionNode::MKLDNNBinaryConvolutionNode(const InferenceEngine::CNNLayerPtr& layer,
bool MKLDNNBinaryConvolutionNode::isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
const auto binConv = std::dynamic_pointer_cast<const ngraph::opset1::BinaryConvolution>(op);
if (!binConv) {
errorMessage = "Only opset1 BinaryConvolution operation is supported";
return false;
}
if (binConv->get_mode() != ngraph::op::v1::BinaryConvolution::BinaryConvolutionMode::XNOR_POPCOUNT) {
errorMessage = "Doesn't support mode: " + ngraph::as_string(binConv->get_mode());
return false;
}
} catch (...) {
return false;
}
return true;
}
MKLDNNBinaryConvolutionNode::MKLDNNBinaryConvolutionNode(const std::shared_ptr<ngraph::Node>& op,
const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode(layer, eng, cache) {
if (mayiuse(x64::avx512_common)) {
implType = impl_desc_type::jit_avx512;
} else if (mayiuse(x64::avx2)) {
implType = impl_desc_type::jit_avx2;
} else if (mayiuse(x64::sse41)) {
implType = impl_desc_type::jit_sse42;
: MKLDNNNode(op, eng, cache) {
std::string errorMessage;
if (isSupportedOperation(op, errorMessage)) {
errorPrefix = "BinaryConvolution node with name '" + getName() + "' ";
const auto binConv = std::dynamic_pointer_cast<const ngraph::opset1::BinaryConvolution>(op);
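// Read convolution parameters from the ngraph op; dilations are stored in the oneDNN convention (value - 1).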
pad_value = binConv->get_pad_value();
for (int i = 0; i < binConv->get_strides().size(); i++) {
stride.push_back(static_cast<ptrdiff_t>(binConv->get_strides()[i]));
}
for (int i = 0; i < binConv->get_dilations().size(); i++) {
dilation.push_back(static_cast<ptrdiff_t>(binConv->get_dilations()[i]) - 1);
}
paddingL = binConv->get_pads_begin();
paddingR = binConv->get_pads_end();
if (mayiuse(x64::avx512_common)) {
implType = impl_desc_type::jit_avx512;
} else if (mayiuse(x64::avx2)) {
implType = impl_desc_type::jit_avx2;
} else if (mayiuse(x64::sse41)) {
implType = impl_desc_type::jit_sse42;
} else {
implType = impl_desc_type::ref;
}
} else {
implType = impl_desc_type::ref;
IE_THROW(NotImplemented) << errorMessage;
}
}
@ -891,28 +925,17 @@ void MKLDNNBinaryConvolutionNode::getSupportedDescriptors() {
if (!descs.empty())
return;
auto* binConvLayer = dynamic_cast<BinaryConvolutionLayer*>(getCnnLayer().get());
if (binConvLayer == nullptr)
IE_THROW() << "Cannot convert convolution layer.";
std::string errorPrefix = "BinaryConvolution layer with name '" + getName() + "' ";
withBinarization = isFusedWith(Quantize);
withBinarization = isFusedWith(FakeQuantize);
withSum = false;
int expectedInputEdgesNum = 2;
for (int i = 0; i < fusedWith.size(); i++) {
auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
if (eltwiseNode && eltwiseNode->isSum()) {
if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) {
withSum = true;
expectedInputEdgesNum++;
}
}
group = binConvLayer->_group;
if (group != 1) {
IE_THROW() << errorPrefix << "doesn't support parameter group != 1";
}
if (getParentEdges().size() != expectedInputEdgesNum)
IE_THROW() << errorPrefix << "has incorrect number of input edges";
@ -930,21 +953,6 @@ void MKLDNNBinaryConvolutionNode::getSupportedDescriptors() {
if (getChildEdgeAt(0)->getDims().ndims() != 4) {
IE_THROW() << errorPrefix << "doesn't support output with rank: " << getChildEdgeAt(0)->getDims().ndims();
}
if ((getParentEdgeAt(0)->getDims().ndims() < 4) || (getParentEdgeAt(0)->getDims().ndims() > 5)) {
IE_THROW() << "Convolution layer. Unsupported mode. Only 4D and 5D blobs are supported as input.";
}
pad_value = binConvLayer->_pad_value;
invertVectorCopyUtoI(binConvLayer->_stride, stride);
for (int i = 1; i <= binConvLayer->_dilation.size(); i++) {
dilation.push_back(static_cast<int>(binConvLayer->_dilation[binConvLayer->_dilation.size() - i]) - 1);
}
auto allPads = getPaddings(*binConvLayer);
invertVectorCopyUtoI(allPads.begin, paddingL);
invertVectorCopyUtoI(allPads.end, paddingR);
}
void MKLDNNBinaryConvolutionNode::initSupportedPrimitiveDescriptors() {
@ -1077,48 +1085,18 @@ void MKLDNNBinaryConvolutionNode::createPrimitive() {
}
bool MKLDNNBinaryConvolutionNode::canFuse(const MKLDNNNodePtr& node) const {
auto isOneOf = [](EltwiseOpType alg, std::vector<EltwiseOpType> algs) {
for (auto a : algs) {
if (alg == a) {
return true;
}
}
return false;
};
if (implType == impl_desc_type::ref)
return false;
// Binarization have to be last operation in fusing chain
if (isFusedWith(Quantize))
if (isFusedWith(FakeQuantize))
return false;
if (node->getType() == Quantize) {
auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(node.get());
if (quantizeNode == nullptr)
IE_THROW() << "Cannot get quantize node " << node->getName();
return quantizeNode->isBinarization();
} else if (node->getType() == Eltwise) {
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(node.get());
if (eltwiseNode == nullptr)
IE_THROW() << "Cannot get eltwise node " << node->getName();
// Only one Add operation can be fused since it is implemented via output blob reuse
if (eltwiseNode->isSum()) {
for (auto& fusedNode : fusedWith) {
auto* fusedEltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(fusedNode.get());
if (fusedEltwiseNode->isSum()) {
return false;
}
}
}
return eltwiseNode->isSum() ||
isOneOf(eltwiseNode->getOpType(), {MulAdd, Prelu, Relu, Gelu, Elu, Logistic, BoundedRelu, Clamp, SoftRelu,
Tanh, Swish, Hswish, Mish, Hsigmoid, Round, Linear, Abs, Square, Sqrt});
if (node->getType() == FakeQuantize) {
return node->getAlgorithm() == FQBinarization;
} else {
return canFuseSimpleOperation(node);
}
return false;
}
void MKLDNNBinaryConvolutionNode::setPostOps(mkldnn::primitive_attr &attr) {
@ -1127,16 +1105,16 @@ void MKLDNNBinaryConvolutionNode::setPostOps(mkldnn::primitive_attr &attr) {
for (auto &node : fusedWith) {
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
if (eltwiseNode) {
if (eltwiseNode->isSum())
if (eltwiseNode->isSpecialConvolutionAddFusing())
ops.append_sum(1.0);
else
eltwiseNode->appendPostOps(ops);
continue;
}
auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode *>(node.get());
if (quantizeNode) {
quantizeNode->appendPostOps(ops);
auto* fakeQuantizeNode = dynamic_cast<MKLDNNFakeQuantizeNode *>(node.get());
if (fakeQuantizeNode) {
fakeQuantizeNode->appendPostOps(ops);
continue;
}

View File

@ -74,7 +74,7 @@ struct jit_uni_bin_conv_kernel {
class MKLDNNBinaryConvolutionNode : public MKLDNNNode {
public:
MKLDNNBinaryConvolutionNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
MKLDNNBinaryConvolutionNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNBinaryConvolutionNode() override = default;
void getSupportedDescriptors() override;
@ -86,7 +86,11 @@ public:
return false;
}
void setPostOps(mkldnn::primitive_attr &attr);
bool canFuse(const MKLDNNNodePtr& node) const;
bool canFuse(const MKLDNNNodePtr& node) const override;
static bool isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept;
impl_desc_type getImplType() { return implType; }
private:
bool withSum = false;
@ -112,6 +116,8 @@ private:
const std::vector<size_t>& s_str, const std::vector<size_t>& w_str, const std::vector<size_t>& d_str);
void executeReference(const uint8_t* src, const uint8_t* weights, uint8_t* dst,
const std::vector<size_t>& s_str, const std::vector<size_t>& w_str, const std::vector<size_t>& d_str);
std::string errorPrefix;
};
} // namespace MKLDNNPlugin

View File

@ -0,0 +1,133 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <cmath>
#include <vector>
#include <string>
#include <mkldnn_types.h>
#include "ie_parallel.hpp"
#include "utils/bfloat16.hpp"
#include <mkldnn_selective_build.h>
#include "mkldnn_broadcast_node.h"
#include <nodes/common/tensor_desc_creator.h>
#include <ngraph/opsets/opset1.hpp>
#include "common/cpu_memcpy.h"
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
bool MKLDNNBroadcastNode::isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
const auto broadcast = std::dynamic_pointer_cast<const ngraph::opset1::Broadcast>(op);
if (!broadcast) {
errorMessage = "Only opset1 Broadcast operation is supported";
return false;
}
if (broadcast->get_broadcast_spec() != ngraph::op::AutoBroadcastSpec::NUMPY) {
errorMessage = "Only NUMPY broadcast type is supported";
return false;
}
if (std::dynamic_pointer_cast<const ngraph::opset1::Constant>(broadcast->get_input_node_shared_ptr(BROADCAST_SHAPE)) == nullptr) {
errorMessage = "Only const 'shape' input is supported";
return false;
}
} catch (...) {
return false;
}
return true;
}
MKLDNNBroadcastNode::MKLDNNBroadcastNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng,
MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
errorPrefix = "Broadcast node with name '" + op->get_friendly_name() + "'";
if (op->get_input_size() != 2 || op->get_output_size() != 1)
IE_THROW() << errorPrefix << " has incorrect number of input/output edges!";
SizeVector shape_dims = op->get_input_shape(BROADCAST_SHAPE);
if (shape_dims.size() > 1)
IE_THROW() << errorPrefix << " has incorrect 'shape' input rank: " << shape_dims.size();
}
void MKLDNNBroadcastNode::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;
Precision prec = getOriginalInputPrecisionAtPort(BROADCAST_INPUT);
addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, prec},
{TensorDescCreatorTypes::ncsp, Precision::I32}},
{{TensorDescCreatorTypes::ncsp, prec}},
impl_desc_type::ref_any);
}
void MKLDNNBroadcastNode::execute(mkldnn::stream strm) {
size_t shape_size = (getParentEdgeAt(BROADCAST_SHAPE)->getDesc().getDims())[0];
SizeVector dst_dims = getChildEdgeAt(0)->getDesc().getDims();
SizeVector src_dims = getParentEdgeAt(BROADCAST_INPUT)->getDesc().getDims();
SizeVector srcStrides = getParentEdgeAt(BROADCAST_INPUT)->getDesc().getBlockingDesc().getStrides();
size_t data_size = getParentEdgeAt(BROADCAST_INPUT)->getDesc().getPrecision().size();
if (!src_dims.size())
src_dims = SizeVector(1, 1);
if (!srcStrides.size())
srcStrides = SizeVector(1, 1);
if (dst_dims.size() != shape_size) {
IE_THROW() << "Output tensor dimension mismatch";
}
if (src_dims.size() > dst_dims.size()) {
IE_THROW() << "Output tensor dimension is smaller then input tensor dimension";
}
InferenceEngine::SizeVector dstStrides = getChildEdgeAt(0)->getDesc().getBlockingDesc().getStrides();
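// Align source dims/strides to the destination rank by left-padding with 1s (NUMPY-style broadcasting).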
InferenceEngine::SizeVector src_aligned(dst_dims.size());
InferenceEngine::SizeVector srcStrides_aligned(dst_dims.size());
size_t prefix_size = dst_dims.size() - src_dims.size();
for (size_t i = 0; i < dst_dims.size(); i++) {
if (i < prefix_size) {
src_aligned[i] = 1;
srcStrides_aligned[i] = srcStrides[0];
} else {
src_aligned[i] = src_dims[i - prefix_size];
srcStrides_aligned[i] = srcStrides[i - prefix_size];
}
}
size_t work_amount_dst = dstStrides[0] * dst_dims[0];
const auto *src_data = reinterpret_cast<const uint8_t *>(getParentEdgeAt(BROADCAST_INPUT)->getMemoryPtr()->GetPtr());
auto *dst_data = reinterpret_cast<uint8_t *>(getChildEdgeAt(0)->getMemoryPtr()->GetPtr());
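// Each thread walks its slice of the flattened output; the source index is derived from the
// per-dimension counters modulo the aligned source dims, and elements are copied byte-wise.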
parallel_nt(0, [&](const int ithr, const int nthr) {
size_t i, src_idx, start = 0, end = 0;
SizeVector counters(dst_dims.size(), 0);
splitter(work_amount_dst, nthr, ithr, start, end);
for (int j = dst_dims.size() - 1, i = start; j >= 0; j--) {
counters[j] = i % dst_dims[j];
i /= dst_dims[j];
}
for (size_t iwork = start * data_size; iwork < end * data_size; iwork += data_size) {
for (i = 0, src_idx = 0; i < dst_dims.size(); ++i)
src_idx += counters[i] ? ((counters[i] % src_aligned[i]) * srcStrides_aligned[i]) : 0;
cpu_memcpy(&dst_data[iwork], &src_data[src_idx * data_size], data_size);
for (int j = dst_dims.size() - 1; j >= 0; j--) {
counters[j] = (counters[j] + 1) % dst_dims[j];
if (counters[j] != 0) break;
}
}
});
}
bool MKLDNNBroadcastNode::created() const {
return getType() == Broadcast;
}
REG_MKLDNN_PRIM_FOR(MKLDNNBroadcastNode, Broadcast)

View File

@ -0,0 +1,35 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ie_common.h>
#include <mkldnn_node.h>
#include <string>
#include <memory>
#include <vector>
namespace MKLDNNPlugin {
class MKLDNNBroadcastNode : public MKLDNNNode {
public:
MKLDNNBroadcastNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNBroadcastNode() override = default;
void getSupportedDescriptors() override {};
void initSupportedPrimitiveDescriptors() override;
void createPrimitive() override {};
void execute(mkldnn::stream strm) override;
bool created() const override;
static bool isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept;
private:
static const size_t BROADCAST_INPUT = 0;
static const size_t BROADCAST_SHAPE = 1;
std::string errorPrefix;
};
} // namespace MKLDNNPlugin

View File

@ -9,7 +9,6 @@
#include <vector>
#include <mkldnn_extension_utils.h>
#include <legacy/ie_layers.h>
#include "mkldnn.hpp"
#include "mkldnn/iml_type_mapper.h"
#include "mkldnn_dims.h"
@ -17,7 +16,7 @@
#include "mkldnn_memory.h"
#include "ie_parallel.hpp"
#include "mkldnn_conv_node.h"
#include "mkldnn_quantize_node.h"
#include "mkldnn_fake_quantize_node.h"
#include "mkldnn_pooling_node.h"
#include "mkldnn_eltwise_node.h"
#include <limits>
@ -27,21 +26,37 @@ using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
MKLDNNConcatNode::MKLDNNConcatNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode(layer, eng, cache) {}
bool MKLDNNConcatNode::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
auto concatOp = ngraph::as_type_ptr<const ngraph::op::v0::Concat>(op);
if (!concatOp) {
errorMessage = "Node is not an instance of the Concat operation.";
return false;
}
} catch (...) {
return false;
}
return true;
}
MKLDNNConcatNode::MKLDNNConcatNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode(op, eng, cache) {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
auto concatOp = ngraph::as_type_ptr<ngraph::op::v0::Concat>(op);
auto axis = concatOp->get_axis();
if (axis < 0) {
this->axis = concatOp->get_input_shape(0).size() + axis;
} else {
this->axis = axis;
}
}
void MKLDNNConcatNode::getSupportedDescriptors() {
auto * conLayer = dynamic_cast<ConcatLayer*>(getCnnLayer().get());
if (conLayer == nullptr)
IE_THROW() << "Cannot convert concat layer.";
axis = conLayer->_axis;
if (getParentEdges().empty())
IE_THROW() << "Incorrect number of input edges for layer " << getName();
if (getChildEdges().empty())
IE_THROW() << "Incorrect number of output edges for layer " << getName();
auto& firstParentDims = getParentEdgeAt(0)->getDims();
for (size_t i = 1; i < getParentEdges().size(); i++) {
auto& dims = getParentEdgeAt(i)->getDims();
@ -64,10 +79,11 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;
inputPrecision = getCnnLayer()->insData[0].lock()->getPrecision();
auto& originInputPrecisions = getOriginalInputPrecisions();
inputPrecision = originInputPrecisions[0];
bool isMixedPrecision = false;
for (int i = 1; i < getCnnLayer()->insData.size(); i++) {
if (getCnnLayer()->insData[0].lock()->getPrecision() != getCnnLayer()->insData[i].lock()->getPrecision()) {
for (int i = 1; i < getOriginalInputsNumber(); i++) {
if (originInputPrecisions[0] != originInputPrecisions[i]) {
isMixedPrecision = true;
break;
}

View File

@ -13,9 +13,10 @@ namespace MKLDNNPlugin {
class MKLDNNConcatNode : public MKLDNNNode {
public:
MKLDNNConcatNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
MKLDNNConcatNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNConcatNode() override = default;
static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept;
void getSupportedDescriptors() override;
void initSupportedPrimitiveDescriptors() override;
void initOptimalPrimitiveDescriptor() override;

View File

@ -6,97 +6,123 @@
#include "mkldnn_reorder_node.h"
#include "mkldnn_input_node.h"
#include "mkldnn_eltwise_node.h"
#include "mkldnn_quantize_node.h"
#include "mkldnn_fake_quantize_node.h"
#include "mkldnn_pooling_node.h"
#include "mkldnn_concat_node.h"
#include <legacy/ie_layers.h>
#include <string>
#include <vector>
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
#include <legacy/ie_layers_internal.hpp>
#include <utils/general_utils.h>
#include <ngraph/ops.hpp>
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
MKLDNNConvolutionNode::MKLDNNConvolutionNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode(layer, eng, cache), withBiases(false), withSum(false), withDWConv(false), isDW(false), isMerged(false),
bool MKLDNNConvolutionNode::isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
if (!ngraph::is_type<ngraph::op::v1::Convolution>(op) && !ngraph::is_type<ngraph::op::v1::GroupConvolution>(op)) {
errorMessage = "Only opset1 Convolution and GroupConvolution operations are supported";
return false;
}
size_t ndims = op->get_input_shape(0).size();
if ((ndims < 4) || (ndims > 5)) {
errorMessage = "Doesn't support 'data' input with rank: " + std::to_string(ndims);
return false;
}
} catch (...) {
return false;
}
return true;
}
MKLDNNConvolutionNode::MKLDNNConvolutionNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode(op, eng, cache), withBiases(false), withSum(false), withDWConv(false),
isGrouped(false), dw_conv_oc(0), dw_conv_ih(0), dw_conv_iw(0), dw_conv_in_dt(memory::data_type::undef),
groupNum(1lu), baseInputsNumber(1), eltwisePrecision(Precision::FP32) {
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
return MKLDNNMemoryDesc(primitive_desc_it.weights_desc(0));
});
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
if (!withBiases)
return MKLDNNMemoryDesc();
return MKLDNNMemoryDesc(primitive_desc_it.weights_desc(1));
});
auto ws = layer->blobs.find("w-scale");
if (ws != layer->blobs.end()) {
wScale = ws->second;
groupNum(1lu), eltwisePrecision(Precision::FP32) {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
// Trying to find oi-scale
if (getCnnLayer()->type == "Convolution" && getCnnLayer()->precision == Precision::I8) {
auto ois = layer->blobs.find("oi-scale");
if ((getCnnLayer()->outData[0]->getPrecision() == Precision::I8 || getCnnLayer()->outData[0]->getPrecision() == Precision::U8)
&& ois == layer->blobs.end()) {
IE_THROW() << "Internal error of graph quantization - mismatch of intermediate scales and next layer type for convolution "
<< getCnnLayer()->name;
}
if (ois != layer->blobs.end()) {
// If we can find an oi-scale, then the next layer has to be an INT8.
oScale = ois->second;
}
}
isPrimitivesPriorityDefined = op->get_rt_info().count("PrimitivesPriority") != 0;
if (getCnnLayer()->type == "Convolution") {
baseInputsNumber = getCnnLayer().get()->insData.size();
auto convolutionOp = ngraph::as_type_ptr<ngraph::op::v1::Convolution>(op);
auto groupConvolutionOp = ngraph::as_type_ptr<ngraph::op::v1::GroupConvolution>(op);
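// Ordinary and grouped convolutions are distinguished here; weight dims, strides and paddings are taken
// from the ngraph op, and dilations are converted to the oneDNN convention (value - 1).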
if (convolutionOp) {
algorithm = ConvolutionCommon;
groupNum = 1;
isGrouped = false;
weightDims = convolutionOp->input_value(1).get_shape();
IC = weightDims[1];
groupIC = IC;
groupOC = weightDims[0];
biasesDims = { groupOC };
for (int i = 0; i < convolutionOp->get_strides().size(); i++) {
stride.push_back(static_cast<ptrdiff_t>(convolutionOp->get_strides()[i]));
}
for (int i = 0; i < convolutionOp->get_dilations().size(); i++) {
dilation.push_back(static_cast<ptrdiff_t>(convolutionOp->get_dilations()[i]) - 1);
}
paddingL = convolutionOp->get_pads_begin();
paddingR = convolutionOp->get_pads_end();
} else if (groupConvolutionOp) {
algorithm = ConvolutionGrouped;
groupNum = groupConvolutionOp->input_value(1).get_shape()[0];
isGrouped = true;
weightDims = groupConvolutionOp->input_value(1).get_shape();
groupIC = weightDims[2];
IC = groupIC * groupNum;
groupOC = weightDims[1];
biasesDims = {groupOC * groupNum};
for (int i = 0; i < groupConvolutionOp->get_strides().size(); i++) {
stride.push_back(static_cast<ptrdiff_t>(groupConvolutionOp->get_strides()[i]));
}
for (int i = 0; i < groupConvolutionOp->get_dilations().size(); i++) {
dilation.push_back(static_cast<ptrdiff_t>(groupConvolutionOp->get_dilations()[i]) - 1);
}
paddingL = groupConvolutionOp->get_pads_begin();
paddingR = groupConvolutionOp->get_pads_end();
}
}
mkldnn::memory::data_type MKLDNNConvolutionNode::precisionToDataType(InferenceEngine::Precision prec) {
// MKLDNN Plugin doesn't support U16 layout so upcast to FP32 in this case
if (prec == Precision::U16)
prec = Precision::FP32;
bool MKLDNNConvolutionNode::canBeExecutedInInt8() const {
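// INT8 execution requires u8/s8 activations and s8 weights; present zero points force these data types.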
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(0));
if (!inputZeroPoints.empty())
inputDataType = memory::data_type::u8;
return MKLDNNExtensionUtils::IEPrecisionToDataType(prec);
auto weightsDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(1));
if (!weightsZeroPoints.empty())
weightsDataType = memory::data_type::s8;
return one_of(inputDataType, memory::data_type::u8, memory::data_type::s8) && weightsDataType == memory::data_type::s8;
}
bool MKLDNNConvolutionNode::canBeExecutedInInt8() {
auto * convLayer = dynamic_cast<ConvolutionLayer*>(getCnnLayer().get());
if (convLayer == nullptr)
IE_THROW() << "Cannot convert convolution layer.";
if (baseInputsNumber > 1) {
auto inputDataType = precisionToDataType(getCnnLayer()->insData[0].lock()->getPrecision());
if (!inputZeroPoints.empty())
inputDataType = memory::data_type::u8;
auto weightsDataType = precisionToDataType(Precision::FP32);
if (baseInputsNumber > 1) {
weightsDataType = precisionToDataType(getCnnLayer()->insData[1].lock()->getPrecision());
if (!weightsZeroPoints.empty())
weightsDataType = memory::data_type::s8;
}
return (inputDataType == mkldnn_s8 || inputDataType == mkldnn_u8) && weightsDataType == mkldnn_s8;
} else {
return this->getCnnLayer()->precision == Precision::I8;
}
}
InferenceEngine::Precision MKLDNNConvolutionNode::fusedEltwisePrecision(MKLDNNEltwiseNode *eltwiseNode, int findex) {
InferenceEngine::Precision MKLDNNConvolutionNode::fusedEltwisePrecision(const MKLDNNNodePtr& fusingNode) const {
InferenceEngine::Precision eltwisePrecision;
auto parent0 = getCreatorLayer(eltwiseNode->getCnnLayer()->insData[0].lock()).lock();
auto parent1 = getCreatorLayer(eltwiseNode->getCnnLayer()->insData[1].lock()).lock();
auto fusedParent = findex != 0 ? fusedWith[findex - 1].get()->getCnnLayer() : this->getCnnLayer();
eltwisePrecision = fusedParent == parent0 ? eltwiseNode->getCnnLayer()->insData[1].lock()->getPrecision() :
eltwiseNode->getCnnLayer()->insData[0].lock()->getPrecision();
int fusingPort = fusingNode->getFusingPort();
if (fusingPort == 0) {
eltwisePrecision = fusingNode->getOriginalInputPrecisionAtPort(1);
} else if (fusingPort == 1) {
eltwisePrecision = fusingNode->getOriginalInputPrecisionAtPort(0);
} else {
IE_THROW() << "Cannot determine Eltwise post op precision for Convolution node with name '" << getName() << "'";
}
return eltwisePrecision;
}
@ -104,47 +130,43 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
if (!descs.empty())
return;
auto * convLayer = dynamic_cast<ConvolutionLayer*>(getCnnLayer().get());
if (convLayer == nullptr)
IE_THROW() << "Cannot convert convolution layer.";
withBiases = getOriginalInputsNumber() == 3;
withSum = false;
int expectedInputEdgesNum = baseInputsNumber;
int expectedInputEdgesNum = static_cast<int>(getOriginalInputsNumber());
for (int i = 0; i < fusedWith.size(); i++) {
auto *convolutionNode = dynamic_cast<MKLDNNConvolutionNode *>(fusedWith[i].get());
if (convolutionNode) {
expectedInputEdgesNum += convolutionNode->getBaseIntputsNumber() - 1;
if (fusedWith[i]->getType() == Convolution) {
expectedInputEdgesNum += static_cast<int>(fusedWith[i]->getOriginalInputsNumber()) - 1;
}
auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
if (eltwiseNode && eltwiseNode->isSum()) {
withSum = true;
expectedInputEdgesNum++;
if (fusedWith[i]->getAlgorithm() == EltwiseAdd) {
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) {
withSum = true;
expectedInputEdgesNum++;
}
}
}
auto inputDataType = precisionToDataType(getCnnLayer()->insData[0].lock()->getPrecision());
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(0));
if (!inputZeroPoints.empty())
inputDataType = memory::data_type::u8;
auto outputDataType = precisionToDataType(getCnnLayer()->outData[0]->getPrecision());
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(getOriginalOutputPrecisionAtPort(0));
eltwisePrecision = MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType);
if (baseInputsNumber > 1) {
if (!fusedWith.empty()) {
auto lastFusedLayer = fusedWith[fusedWith.size() - 1].get()->getCnnLayer();
if (lastFusedLayer) {
outputDataType = precisionToDataType(lastFusedLayer->outData[0]->getPrecision());
eltwisePrecision = MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType);
}
}
if (!fusedWith.empty()) {
outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0));
eltwisePrecision = MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType);
}
// We need to make sure that convolution output and second input of fused Eltwise operation
// have equal precision sizes since they use the same physical memory. In case precisions are different we upscale to FP32.
if (outputDataType != memory::data_type::f32 && outputDataType != memory::data_type::bf16 && withSum) {
for (int i = 0; i < fusedWith.size(); i++) {
auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
if (eltwiseNode && eltwiseNode->isSum()) {
eltwisePrecision = fusedEltwisePrecision(eltwiseNode, i);
// We need to make sure that convolution output and second input of fused Eltwise operation
// have equal precision sizes since they use the same physical memory. In case precisions are different we upscale to FP32.
if (outputDataType != memory::data_type::f32 && outputDataType != memory::data_type::bf16 && withSum) {
for (int i = 0; i < fusedWith.size(); i++) {
if (fusedWith[i]->getAlgorithm() == EltwiseAdd) {
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) {
eltwisePrecision = fusedEltwisePrecision(fusedWith[i]);
if (MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType).size() != eltwisePrecision.size()) {
eltwisePrecision = Precision::FP32;
outputDataType = memory::data_type::f32;
@ -160,81 +182,7 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
if (getChildEdges().empty())
IE_THROW() << "Incorrect number of output edges for layer " << getName();
if ((getParentEdgeAt(0)->getDims().ndims() < 4) || (getParentEdgeAt(0)->getDims().ndims() > 5)) {
IE_THROW() << "Convolution layer. Unsupported mode. Only 4D and 5D blobs are supported as input.";
}
isMerged = (!getMergeWith().empty()); // grouped convolution was constructed from split->concat subgraph
isGrouped = convLayer->_group != 1; // group info available from IR
if (isMerged && isGrouped)
IE_THROW() << "Convolution initialization. Group splitted mode are used together with direct group specification.";
// default values. Can be replaced in next steps
groupNum = convLayer->_group;
size_t IC = convLayer->input()->getDims()[1];
size_t groupIC = IC;
size_t groupOC = convLayer->_out_depth;
isDW = groupNum == groupOC && groupNum == groupIC;
if (isMerged) {
groupNum = getMergeWith().size() + 1;
}
if (isGrouped) {
groupIC /= groupNum;
groupOC /= groupNum;
}
weightDims.clear();
weightDims.push_back(groupOC);
weightDims.push_back(groupIC);
for (int i = 1; i <= convLayer->_kernel.size(); i++) {
weightDims.push_back(convLayer->_kernel[convLayer->_kernel.size() - i]);
}
biasesDims = { groupOC * groupNum };
if (isGrouped || isMerged) weightDims.insert(weightDims.begin(), groupNum);
withBiases = (convLayer->_biases != nullptr && convLayer->_biases->size() != 0) || baseInputsNumber == 3;
if (baseInputsNumber == 1) {
internalBlobs.push_back(createInternalBlob(weightDims, true, isGrouped));
if (withBiases) {
internalBlobs.push_back(createInternalBlob(biasesDims, false));
}
Blob::Ptr weights = this->getCnnLayer()->blobs.find("weights")->second;
if (weights->getTensorDesc().getPrecision() == Precision::I8) {
// The weights blob has incorrect dims, so we have to fix it
TensorDesc wdesc = internalBlobs[0]->getTensorDesc();
wdesc.setPrecision(Precision::I8);
InferenceEngine::TBlob<int8_t>::Ptr reshapedInt8Weights =
InferenceEngine::TBlob<int8_t>::Ptr(
new InferenceEngine::TBlob<int8_t>(wdesc, static_cast<int8_t*>(weights->buffer()), weights->byteSize()));
internalBlobs[0] = reshapedInt8Weights;
if (withBiases) {
Blob::Ptr biases = this->getCnnLayer()->blobs.find("biases")->second;
TensorDesc bdesc = internalBlobs[1]->getTensorDesc();
bdesc.setPrecision(Precision::I32);
InferenceEngine::TBlob<int32_t>::Ptr reshapedInt32Biases =
InferenceEngine::TBlob<int32_t>::Ptr(
new InferenceEngine::TBlob<int32_t>(bdesc, static_cast<int32_t*>(biases->buffer()), biases->byteSize()));
internalBlobs[1] = reshapedInt32Biases;
}
}
}
invertVectorCopyUtoI(convLayer->_stride, stride);
for (int i = 1; i <= convLayer->_dilation.size(); i++) {
dilation.push_back(static_cast<int>(convLayer->_dilation[convLayer->_dilation.size() - i]) - 1);
}
auto allPads = getPaddings(*convLayer);
invertVectorCopyUtoI(allPads.begin, paddingL);
invertVectorCopyUtoI(allPads.end, paddingR);
int ndims = getParentEdgesAtPort(0)[0]->getDims().ndims();
MKLDNNDims weightsDims = MKLDNNDims(weightDims);
withDWConv = isFusedWith(Convolution);
@ -242,29 +190,26 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
for (int i = 0; i < fusedWith.size(); i++) {
auto *convolutionNode = dynamic_cast<MKLDNNConvolutionNode *>(fusedWith[i].get());
if (convolutionNode) {
auto *convLayer = reinterpret_cast<ConvolutionLayer *>(convolutionNode->getCnnLayer().get());
dw_conv_ih = convolutionNode->inDims[0][convolutionNode->inDims[0].ndims() - 2];
dw_conv_iw = convolutionNode->inDims[0][convolutionNode->inDims[0].ndims() - 1];
dw_conv_oc = convLayer->_out_depth;
for (int j = 0; j < convLayer->_kernel.size(); j++) {
dw_conv_kernel.push_back(convLayer->_kernel[j]);
}
for (int j = 0; j < convLayer->_stride.size(); j++) {
dw_conv_strides.push_back(convLayer->_stride[j]);
}
dw_conv_oc = convolutionNode->outDims[0][1];
const auto &dwWeightsDims = convolutionNode->inDims[1].ToSizeVector();
dw_conv_kernel.push_back(dwWeightsDims[dwWeightsDims.size() - 1]);
dw_conv_kernel.push_back(dwWeightsDims[dwWeightsDims.size() - 2]);
dw_conv_strides = convolutionNode->getStride();
if (canBeExecutedInInt8()) {
if (i == 0) {
dw_conv_in_dt = precisionToDataType(getCnnLayer()->outData[0]->getPrecision());
dw_conv_in_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(getOriginalOutputPrecisionAtPort(0));
} else {
dw_conv_in_dt = precisionToDataType(fusedWith[i - 1].get()->getCnnLayer()->outData[0]->getPrecision());
dw_conv_in_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(fusedWith[i - 1]->getOriginalOutputPrecisionAtPort(0));
}
} else {
dw_conv_in_dt = memory::data_type::f32;
}
for (int j = 0; j < paddingR.size(); j++) {
int with_group = (isGrouped || isMerged) ? 1 : 0;
int with_group = isGrouped ? 1 : 0;
int krn = weightsDims[with_group + 2 + j];
int src = getParentEdgeAt(0)->getDims()[2 + j];
int dst = getChildEdgeAt(0)->getDims()[2 + j];
@ -283,30 +228,32 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
outputDataType = memory::data_type::f32;
if (eltwisePrecision == Precision::BF16)
eltwisePrecision = Precision::FP32;
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
getParentEdgeAt(0)->getDims().ndims() == 5 ? memory::format_tag::ndhwc : memory::format_tag::nhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType,
getParentEdgeAt(0)->getDims().ndims() == 5 ? memory::format_tag::ndhwc : memory::format_tag::nhwc);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, ndims == 5 ? memory::format_tag::ndhwc
: memory::format_tag::nhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, ndims == 5 ? memory::format_tag::ndhwc
: memory::format_tag::nhwc);
createDescriptor({in_candidate}, {out_candidate});
} else {
inputDataType = (convLayer->input()->getPrecision() == Precision::BF16
&& !(isGrouped && getParentEdgeAt(0)->getDims().ndims() == 5)) ? memory::data_type::bf16 : memory::data_type::f32;
outputDataType = (convLayer->outData[0]->getPrecision() == Precision::BF16
&& !(isGrouped && getParentEdgeAt(0)->getDims().ndims() == 5)) ? memory::data_type::bf16 : memory::data_type::f32;
inputDataType = (getOriginalInputPrecisionAtPort(0) == Precision::BF16 && !(isGrouped && ndims == 5)) ? memory::data_type::bf16
: memory::data_type::f32;
outputDataType = (getOriginalOutputPrecisionAtPort(0) == Precision::BF16 && !(isGrouped && ndims == 5)) ? memory::data_type::bf16
: memory::data_type::f32;
eltwisePrecision = Precision::FP32;
for (int i = 0; i < fusedWith.size(); i++) {
auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
if (eltwiseNode && eltwiseNode->isSum()) {
eltwisePrecision = fusedEltwisePrecision(eltwiseNode, i);
// TODO(amalyshe): there might be a situation when the convolution can be executed in BF16,
// the output is required in FP32, but the eltwise in-place tensor would be in BF16.
// Currently we forcibly change the output to BF16, which will add a reorder after the node.
// Another situation is when we mark the output as FP32 and the Eltwise precision (which stands
// for the precision of the in-place input tensor) as FP32. This will add a reorder for that
// in-place tensor before the fused convolution. This behaviour might be more correct regarding
// the expected markup of the graph, but the performance of the two approaches might differ. Need to verify.
outputDataType = eltwisePrecision == Precision::BF16 ? memory::data_type::bf16 : memory::data_type::f32;
eltwisePrecision = MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType);
if (fusedWith[i]->getAlgorithm() == EltwiseAdd) {
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) {
eltwisePrecision = fusedEltwisePrecision(fusedWith[i]);
// TODO(amalyshe): there might be a situation when the convolution can be executed in BF16,
// the output is required in FP32, but the eltwise in-place tensor would be in BF16.
// Currently we forcibly change the output to BF16, which will add a reorder after the node.
// Another situation is when we mark the output as FP32 and the Eltwise precision (which stands
// for the precision of the in-place input tensor) as FP32. This will add a reorder for that
// in-place tensor before the fused convolution. This behaviour might be more correct regarding
// the expected markup of the graph, but the performance of the two approaches might differ. Need to verify.
outputDataType = eltwisePrecision == Precision::BF16 ? memory::data_type::bf16 : memory::data_type::f32;
eltwisePrecision = MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType);
}
}
}
// correction for the FP32 input case - FP32 convolution does not support BF16 output
@ -316,16 +263,13 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
eltwisePrecision = Precision::FP32;
}
Layout layout = convLayer->input()->getLayout();
if (layout == NCHW || layout == NHWC) {
if (ndims == 4) {
if (IC == 1 && groupOC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nchw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nchw);
createDescriptor({in_candidate}, {out_candidate});
} else if (IC == 3 || IC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCHW ? memory::format_tag::nchw : memory::format_tag::nhwc);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nchw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nChw16c);
createDescriptor({in_candidate}, {out_candidate});
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nChw8c);
@ -339,19 +283,16 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
createDescriptor({in_candidate}, {out_candidate});
}
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCHW ? memory::format_tag::nchw : memory::format_tag::nhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType,
layout == NCHW ? memory::format_tag::nchw : memory::format_tag::nhwc);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nchw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nchw);
createDescriptor({in_candidate}, {out_candidate});
} else if (layout == NCDHW || layout == NDHWC) {
} else if (ndims == 5) {
if (IC == 1 && groupOC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::ncdhw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::ncdhw);
createDescriptor({in_candidate}, {out_candidate});
} else if (IC == 3 || IC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCDHW ? memory::format_tag::ncdhw : memory::format_tag::ndhwc);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::ncdhw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nCdhw16c);
createDescriptor({in_candidate}, {out_candidate});
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nCdhw8c);
@ -365,17 +306,14 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
createDescriptor({in_candidate}, {out_candidate});
}
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCDHW ? memory::format_tag::ncdhw : memory::format_tag::ndhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType,
layout == NCDHW ? memory::format_tag::ncdhw : memory::format_tag::ndhwc);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::ncdhw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::ncdhw);
createDescriptor({in_candidate}, {out_candidate});
}
}
}
void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights = false) {
int blob_idx = 0;
void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights = false) const {
mkldnn::post_ops ops;
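// Translate every fused node (Eltwise, FakeQuantize, fused depthwise Convolution) into the corresponding oneDNN post-operation.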
for (auto &node : fusedWith) {
@ -383,66 +321,31 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
continue;
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
if (eltwiseNode && eltwiseNode->isSum()) {
ops.append_sum(1.0, precisionToDataType(eltwisePrecision));
continue;
}
if (eltwiseNode) {
eltwiseNode->appendPostOps(ops);
if (eltwiseNode->isSpecialConvolutionAddFusing())
ops.append_sum(1.0, MKLDNNExtensionUtils::IEPrecisionToDataType(eltwisePrecision));
else
eltwiseNode->appendPostOps(ops);
continue;
}
auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode *>(node.get());
if (quantizeNode) {
quantizeNode->appendPostOps(ops);
auto* fakeQuantizeNode = dynamic_cast<MKLDNNFakeQuantizeNode *>(node.get());
if (fakeQuantizeNode) {
fakeQuantizeNode->appendPostOps(ops);
continue;
}
auto* convolutionNode = dynamic_cast<MKLDNNConvolutionNode *>(node.get());
if (convolutionNode) {
if (initWeights) {
if (convolutionNode->getBaseIntputsNumber() == 1) {
auto* convLayer = reinterpret_cast<ConvolutionLayer*>(convolutionNode->getCnnLayer().get());
auto weightsPrc = precisionToDataType(convLayer->precision);
auto biasPrc = memory::data_type::s32;
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
MKLDNNDims dwWeightsDims({dw_conv_oc, (ptrdiff_t)1, (ptrdiff_t)1, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS]});
PostOpsIntBlobMemory[blob_idx]->Create(dwWeightsDims, weightsPrc, memory::format_tag::Goihw8g);
PostOpsIntBlobMemory[blob_idx]->FillZero();
Blob::Ptr weights = convLayer->blobs.find("weights")->second;
Blob::Ptr biases = convLayer->blobs.find("biases")->second;
PostOpsIntBlobMemory[blob_idx]->SetData(weightsPrc, memory::format_tag::goihw, weights->buffer(),
dwWeightsDims.size() * MKLDNNExtensionUtils::sizeOfDataType(weightsPrc));
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
MKLDNNDims dwBiasesDims({dw_conv_oc});
PostOpsIntBlobMemory[blob_idx + 1]->Create(dwBiasesDims, biasPrc, memory::format_tag::x);
PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
PostOpsIntBlobMemory[blob_idx + 1]->SetData(biasPrc, memory::format_tag::x, biases->buffer(),
dwBiasesDims.size() * MKLDNNExtensionUtils::sizeOfDataType(biasPrc));
// todo: rewrite onto append_dw_k3s2p1
ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
mkldnn::memory::convert_to_c(dw_conv_in_dt),
static_cast<const float *>(PostOpsIntBlobMemory[blob_idx]->GetData()),
static_cast<const float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData()));
blob_idx += 2;
} else {
// todo: rewrite onto append_dw_k3s2p1
ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
mkldnn::memory::convert_to_c(dw_conv_in_dt),
static_cast<const float *>(getParentEdgeAt(
baseInputsNumber + 0)->getMemory().GetData()),
static_cast<const float *>(getParentEdgeAt(
baseInputsNumber + 1)->getMemory().GetData()));
}
// todo: rewrite onto append_dw_k3s2p1
ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
mkldnn::memory::convert_to_c(dw_conv_in_dt),
static_cast<const float *>(getParentEdgeAt(
getOriginalInputsNumber() + 0)->getMemory().GetData()),
static_cast<const float *>(getParentEdgeAt(
getOriginalInputsNumber() + 1)->getMemory().GetData()));
} else {
// todo: rewrite onto append_dw_k3s2p1
ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
@ -451,47 +354,6 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
nullptr,
nullptr);
}
if (convolutionNode->wScale != nullptr) {
float* wScaleData = static_cast<float*>(convolutionNode->wScale->buffer());
std::vector<float> oScaleDataVector;
std::vector<float> oShiftDataVector;
if (convolutionNode->getCnnLayer()->precision == Precision::I8 &&
convolutionNode->getCnnLayer()->outData[0]->getPrecision() != Precision::FP32) {
float *oScaleData = static_cast<float *>(convolutionNode->oScale->buffer());
for (size_t c = 0; c < convolutionNode->wScale->size(); c++) {
oScaleDataVector.push_back(wScaleData[c] / oScaleData[c]);
oShiftDataVector.push_back(0.f);
}
} else {
for (size_t c = 0; c < convolutionNode->wScale->size(); c++) {
oScaleDataVector.push_back(wScaleData[c]);
oShiftDataVector.push_back(0.f);
}
}
MKLDNNDims oScaleDims({static_cast<ptrdiff_t>(rnd_up(biasesDims[0], 16))});
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[blob_idx]->Create(oScaleDims, memory::data_type::f32, memory::format_tag::x);
PostOpsIntBlobMemory[blob_idx]->FillZero();
PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::format_tag::x, &oScaleDataVector[0],
oScaleDataVector.size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[blob_idx + 1]->Create(oScaleDims, memory::data_type::f32, memory::format_tag::x);
PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::format_tag::x, &oShiftDataVector[0],
oShiftDataVector.size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
ops.append_depthwise(mkldnn::algorithm::depthwise_scale_shift,
static_cast<const float *>(PostOpsIntBlobMemory[blob_idx]->GetData()),
static_cast<const float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData()));
blob_idx += 2;
}
continue;
}
@ -528,8 +390,8 @@ void MKLDNNConvolutionNode::initSupportedPrimitiveDescriptors() {
config.inConfs.push_back(dataConfig);
}
if (withDWConv && baseInputsNumber > 1) {
auto weightsPrc = precisionToDataType(dw_conv_in_dt == mkldnn_u8 ? Precision::I8 : Precision::FP32);
if (withDWConv) {
auto weightsPrc = MKLDNNExtensionUtils::IEPrecisionToDataType(dw_conv_in_dt == mkldnn_u8 ? Precision::I8 : Precision::FP32);
auto biasPrc = memory::data_type::f32;
MKLDNNDims dwWeightsDims({dw_conv_oc, (ptrdiff_t)1, (ptrdiff_t)1, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS]});
@ -553,7 +415,7 @@ void MKLDNNConvolutionNode::initSupportedPrimitiveDescriptors() {
dataConfig.constant = false;
dataConfig.desc = getDstMemDesc(itpd, i);
if (!(isGrouped || isMerged))
if (!isGrouped)
dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(dataConfig.desc);
config.outConfs.push_back(dataConfig);
@ -582,7 +444,6 @@ void MKLDNNConvolutionNode::createPrimitive() {
mkldnn::primitive_attr attr;
addZeroPoints(attr);
setPostOps(attr, true);
addScaleToPrimitiveAttr(attr);
auto prim_desc = createPrimitiveDescriptor<convolution_forward::primitive_desc,
convolution_forward::desc>(attr);
@ -590,11 +451,14 @@ void MKLDNNConvolutionNode::createPrimitive() {
prim.reset(new convolution_forward(prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto wei = getParentEdgesAtPort(1)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
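// Bind the source, weights, optional bias and destination memory to the oneDNN primitive execution arguments.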
if (withBiases)
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, getWeights()}, {DNNL_ARG_BIAS, getBias()}, {DNNL_ARG_DST, dst}};
else
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, getWeights()}, {DNNL_ARG_DST, dst}};
if (withBiases) {
auto bias = getParentEdgesAtPort(2)[0]->getMemoryPtr()->GetPrimitive();
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, wei}, {DNNL_ARG_BIAS, bias}, {DNNL_ARG_DST, dst}};
} else {
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, wei}, {DNNL_ARG_DST, dst}};
}
}
bool MKLDNNConvolutionNode::created() const {
@ -605,55 +469,25 @@ void MKLDNNConvolutionNode::createDescriptor(const std::vector<InferenceEngine::
const std::vector<InferenceEngine::TensorDesc> &outputDesc) {
TensorDesc inDesc = inputDesc[0], outDesc = outputDesc[0];
mkldnn::memory::data_type wdt = precisionToDataType(inDesc.getPrecision());
mkldnn::memory::data_type bdt = precisionToDataType(inDesc.getPrecision());
if (inDesc.getPrecision() == Precision::BF16) {
bdt = mkldnn::memory::data_type::f32;
}
memory::data_type wdt = MKLDNNExtensionUtils::IEPrecisionToDataType(inDesc.getPrecision());
memory::data_type bdt = memory::data_type::f32;
if (inDesc.getPrecision() == Precision::U8 || inDesc.getPrecision() == Precision::I8) {
wdt = memory::data_type::s8;
bdt = baseInputsNumber == 3 ? precisionToDataType(getCnnLayer()->insData[2].lock()->getPrecision()) : memory::data_type::s32;
}
if (baseInputsNumber == 1) {
Blob::Ptr weights = this->getCnnLayer()->blobs.find("weights")->second;
if (weights->getTensorDesc().getPrecision() == Precision::I8) {
wdt = memory::data_type::s8;
bdt = memory::data_type::s32;
Precision outPrec;
if (getCnnLayer()->outData[0]->getPrecision() == Precision::FP32) {
outPrec = Precision::FP32;
} else {
// define precision according to the normalizer
// TODO(amalyshe) do we need to have separate flow for last in int8 chain or not?
outPrec = outDesc.getPrecision();
}
inDesc = TensorDesc(inDesc.getPrecision(), inputDesc[0].getDims(), inputDesc[0].getBlockingDesc());
outDesc = TensorDesc(outPrec, outputDesc[0].getDims(), outputDesc[0].getBlockingDesc());
}
}
MKLDNNMemoryDesc in_candidate(inDesc);
MKLDNNMemoryDesc out_candidate(outDesc);
// grouping and autoblocking are not compatible
if (((isGrouped && !isDW) || isMerged) && (in_candidate.blocksExtended() || out_candidate.blocksExtended()))
return;
MKLDNNDims blocked_weightDims(weightDims);
MKLDNNDims blocked_biasesDims(biasesDims);
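// memory::format_tag::any lets oneDNN deduce the optimal weights layout for the selected implementation.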
MKLDNNMemoryDesc wgh_candidate{blocked_weightDims, wdt, memory::format_tag::any};
std::vector<algorithm> algorithms;
// We cannot map wino_format on tensor descriptor for now
if (getBaseIntputsNumber() == 1) {
algorithms.push_back(algorithm::convolution_winograd);
}
algorithms.push_back(algorithm::convolution_direct);
std::vector<mkldnn::algorithm> algorithms;
// TODO [NM]: We cannot map wino_format on tensor descriptor for now
// algorithms.push_back(algorithm::convolution_winograd);
algorithms.push_back(mkldnn::algorithm::convolution_direct);
for (auto alg : algorithms) {
try {
@ -695,27 +529,6 @@ void MKLDNNConvolutionNode::addZeroPoints(mkldnn::primitive_attr& attr) const {
}
}
void MKLDNNConvolutionNode::addScaleToPrimitiveAttr(mkldnn::primitive_attr attr) const {
if (wScale != nullptr) {
float* wScaleData = static_cast<float*>(wScale->buffer());
std::vector<float> oScaleDataVector;
if (getCnnLayer()->precision == Precision::I8 && getCnnLayer()->outData[0]->getPrecision() != Precision::FP32) {
float *oScaleData = static_cast<float *>(oScale->buffer());
for (size_t c = 0; c < wScale->size(); c++) {
oScaleDataVector.push_back(wScaleData[c] / oScaleData[c]);
}
} else {
for (size_t c = 0; c < wScale->size(); c++) {
oScaleDataVector.push_back(wScaleData[c]);
}
}
attr.set_output_scales(1 << 1 /*through C dim*/, oScaleDataVector);
}
}
void MKLDNNConvolutionNode::initDescriptor(const InferenceEngine::LayerConfig& config) {
auto* selectedPD = getSelectedPrimitiveDescriptor();
if (!selectedPD) {
@ -725,18 +538,15 @@ void MKLDNNConvolutionNode::initDescriptor(const InferenceEngine::LayerConfig& c
// Strided blobs feature support.
// Works only for FP32 convolutions for now.
bool isStridedBlobsSupported = true;
for (auto &insData : getCnnLayer()->insData) {
if (insData.lock()->getPrecision() != InferenceEngine::Precision::FP32
&& insData.lock()->getPrecision() != InferenceEngine::Precision::BF16) {
isStridedBlobsSupported = false;
break;
}
}
// TODO: fix strided blobs feature support for dynamic weights
if (baseInputsNumber != 1) {
// TODO [NM]: refactor to use a global executionPrecision.
if (canBeExecutedInInt8()) {
isStridedBlobsSupported = false;
}
// TODO [NM]: fix strided blobs feature support for dynamic weights
// if (getOriginalInputsNumber() != 1) {
// isStridedBlobsSupported = false;
// }
if (isStridedBlobsSupported) {
createDescriptor({config.inConfs[0].desc}, {config.outConfs[0].desc});
@ -745,7 +555,6 @@ void MKLDNNConvolutionNode::initDescriptor(const InferenceEngine::LayerConfig& c
mkldnn::primitive_attr attr;
addZeroPoints(attr);
setPostOps(attr);
addScaleToPrimitiveAttr(attr);
InferenceEngine::LayerConfig rightConfig = selectedPD->getConfig();
size_t selected_count = 0;
@ -768,8 +577,8 @@ void MKLDNNConvolutionNode::initDescriptor(const InferenceEngine::LayerConfig& c
cfg.inConfs.push_back(dataConfig);
}
if (withDWConv && baseInputsNumber > 1) {
auto weightsPrc = precisionToDataType(dw_conv_in_dt == mkldnn_u8 ? Precision::I8 : Precision::FP32);
if (withDWConv) {
auto weightsPrc = MKLDNNExtensionUtils::IEPrecisionToDataType(dw_conv_in_dt == mkldnn_u8 ? Precision::I8 : Precision::FP32);
auto biasPrc = memory::data_type::f32;
MKLDNNDims dwWeightsDims({dw_conv_oc, (ptrdiff_t)1, (ptrdiff_t)1, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS]});
@ -853,14 +662,14 @@ void MKLDNNConvolutionNode::filterSupportedDescriptors() {
}
}
bool MKLDNNConvolutionNode::isPossibleToSkipInitConfig(MKLDNNDescriptor &desc) {
bool MKLDNNConvolutionNode::isPossibleToSkipInitConfig(MKLDNNDescriptor &desc) const {
// WA: In some cases, we can predict in advance the type of primitive that will be called in the future.
// In particular, isPossibleToSkipInitConfig() checks whether we can skip the creation of primitives with
// gemm implementation, which significantly increases the network load time.
if (!inputMemoryFormatsFilter.empty() || !outputMemoryFormatsFilter.empty())
return false;
if (getCnnLayer()->params.find("PrimitivesPriority") != getCnnLayer()->params.end())
if (isPrimitivesPriorityDefined)
return false;
// Here we check that we will not delete jit_planar_conv primitive by mistake.
@ -920,12 +729,8 @@ MKLDNNMemoryDesc MKLDNNConvolutionNode::getSrcMemDesc(mkldnn::primitive_desc_ite
}
}
const mkldnn::memory& MKLDNNConvolutionNode::getWeights() const {
return baseInputsNumber > 1 ? getParentEdgeAt(1)->getMemory().GetPrimitive() : internalBlobMemory[0]->GetPrimitive();
}
const mkldnn::memory& MKLDNNConvolutionNode::getBias() const {
return baseInputsNumber > 2 ? getParentEdgeAt(2)->getMemory().GetPrimitive() : internalBlobMemory[1]->GetPrimitive();
bool MKLDNNConvolutionNode::canFuse(const MKLDNNNodePtr& node) const {
return canFuseSimpleOperation(node);
}
InferenceEngine::Precision MKLDNNConvolutionNode::getRuntimePrecision() const {

View File

@ -16,9 +16,10 @@ class MKLDNNEltwiseNode;
class MKLDNNConvolutionNode : public MKLDNNNode {
public:
MKLDNNConvolutionNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
MKLDNNConvolutionNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNConvolutionNode() override = default;
static bool isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept;
void getSupportedDescriptors() override;
void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
@ -26,50 +27,45 @@ public:
void createPrimitive() override;
void initSupportedPrimitiveDescriptors() override;
void filterSupportedPrimitiveDescriptors() override;
void filterSupportedDescriptors();
bool isPossibleToSkipInitConfig(MKLDNNDescriptor &desc);
bool created() const override;
bool canBeInPlace() const override {
return false;
}
void setPostOps(mkldnn::primitive_attr &attr, bool initWeights);
size_t descInputNumbers(MKLDNNDescriptor desc) override {
return static_cast<size_t>(baseInputsNumber);
}
int getBaseIntputsNumber() {
return baseInputsNumber;
}
MKLDNNMemoryDesc getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
const mkldnn::memory& getWeights() const;
const mkldnn::memory& getBias() const;
bool canBeExecutedInInt8();
InferenceEngine::Precision getRuntimePrecision() const override;
MKLDNNMemoryDesc getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
size_t descInputNumbers(MKLDNNDescriptor desc) override {
return static_cast<size_t>(getOriginalInputsNumber());
}
bool canBeExecutedInInt8() const;
size_t getGroupNum() const { return groupNum; }
std::vector<uint8_t> inputZeroPoints;
std::vector<float> weightsZeroPoints;
std::vector<int32_t> outputCompensation;
const InferenceEngine::SizeVector &getWeightDims() { return weightDims; }
const std::vector<ptrdiff_t> &getStride() { return stride; }
const std::vector<ptrdiff_t> &getDilation() { return dilation; }
const std::vector<ptrdiff_t> &getPaddingL() { return paddingL; }
const std::vector<ptrdiff_t> &getPaddingR() { return paddingR; }
bool canFuse(const MKLDNNNodePtr& node) const override;
protected:
void addScaleToPrimitiveAttr(mkldnn::primitive_attr attr) const;
InferenceEngine::Precision fusedEltwisePrecision(MKLDNNEltwiseNode *eltwiseNode, int findex);
InferenceEngine::Precision fusedEltwisePrecision(const MKLDNNNodePtr& fusingNode) const;
private:
mkldnn::memory::data_type precisionToDataType(InferenceEngine::Precision prec);
void addZeroPoints(mkldnn::primitive_attr& attr) const;
void setPostOps(mkldnn::primitive_attr &attr, bool initWeights) const;
void filterSupportedDescriptors();
bool isPossibleToSkipInitConfig(MKLDNNDescriptor &desc) const;
bool withBiases;
bool withSum;
bool withDWConv;
bool isDW;
bool isMerged;
bool isGrouped;
bool isPrimitivesPriorityDefined;
std::vector<ptrdiff_t> stride;
std::vector<ptrdiff_t> dilation;
std::vector<ptrdiff_t> paddingL;
@ -83,14 +79,16 @@ private:
std::vector<ptrdiff_t> dw_conv_kernel;
std::vector<ptrdiff_t> dw_conv_strides;
mkldnn::memory::data_type dw_conv_in_dt;
std::vector<MKLDNNMemoryPtr> PostOpsIntBlobMemory;
InferenceEngine::Blob::Ptr wScale, oScale;
size_t groupNum;
int baseInputsNumber;
size_t IC;
size_t groupIC;
size_t groupOC;
InferenceEngine::Precision eltwisePrecision;
const size_t X_AXIS = 0;
const size_t Y_AXIS = 1;
};
} // namespace MKLDNNPlugin

View File

@ -6,15 +6,43 @@
#include "mkldnn_convert_node.h"
#include "common/cpu_convert.h"
#include "common/tensor_desc_creator.h"
#define THROW_ERROR IE_THROW() << getTypeStr() << " layer with name '" << getName() <<"' ERROR: "
#include <ngraph/opsets/opset1.hpp>
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
MKLDNNConvertNode::MKLDNNConvertNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) :
MKLDNNNode(layer, eng, cache) {}
bool MKLDNNConvertNode::isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
const auto convert = std::dynamic_pointer_cast<const ngraph::opset1::Convert>(op);
if (!convert) {
errorMessage = "Only opset1 Convert operation is supported";
return false;
}
} catch (...) {
return false;
}
return true;
}
MKLDNNConvertNode::MKLDNNConvertNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) :
MKLDNNNode(op, eng, cache) {
std::string errorMessage;
if (isSupportedOperation(op, errorMessage)) {
errorPrefix = "Convert node with name '" + getName() + "'";
} else {
IE_THROW(NotImplemented) << errorMessage;
}
}
MKLDNNConvertNode::MKLDNNConvertNode(const InferenceEngine::SizeVector &dims, const InferenceEngine::Precision &inPrc, const InferenceEngine::Precision &outPrc,
const std::string &nodeName, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode("Convert", nodeName, eng, cache) {
inDims.emplace_back(dims);
addOriginalInputPrecision(inPrc);
outDims.emplace_back(dims);
addOriginalOutputPrecision(outPrc);
}
void MKLDNNConvertNode::getSupportedDescriptors() {
// if tensor descriptors are set via setDescs method we need to update the inDims/outDims data
@ -24,20 +52,15 @@ void MKLDNNConvertNode::getSupportedDescriptors() {
if (inDims.empty() && input && input->getLayout() != InferenceEngine::Layout::ANY)
inDims.push_back(MKLDNNDims(input->getDims()));
if (getParentEdges().size() != 1)
THROW_ERROR << "Incorrect number of input edges";
IE_THROW() << errorPrefix << " has incorrect number of input edges";
if (getChildEdges().empty())
THROW_ERROR << "Incorrect number of output edges";
IE_THROW() << errorPrefix << " has incorrect number of output edges";
}
void MKLDNNConvertNode::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;
auto layer = getCnnLayer();
if (layer == nullptr) {
THROW_ERROR << "Cannot get CNN layer";
}
LayerConfig config;
DataConfig dataIn;
DataConfig dataConfigOut;
@ -54,16 +77,11 @@ void MKLDNNConvertNode::initSupportedPrimitiveDescriptors() {
dataConfigOut.desc = TensorDesc(output->getPrecision(), input->getDims(), blockingDesc);
config.outConfs.push_back(dataConfigOut);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, MKLDNNMemoryDesc(config.outConfs.front().desc).getFormat());
} else if (layer->insData.size() == 1 && layer->outData.size() == 1) {
auto insData = layer->insData[0].lock();
if (nullptr == insData) {
THROW_ERROR << "Input data is empty";
}
const SizeVector& insDims = insData->getTensorDesc().getDims();
auto insPrecision = insData->getTensorDesc().getPrecision();
const SizeVector& outputDims = layer->outData[0]->getTensorDesc().getDims();
auto outPrecision = layer->outData[0]->getTensorDesc().getPrecision();
} else if (getOriginalInputsNumber() == 1 && getOriginalOutputsNumber() == 1) {
const SizeVector& insDims = getParentEdgeAt(0)->getDims().ToSizeVector();
auto insPrecision = getOriginalInputPrecisionAtPort(0);
const SizeVector& outputDims = getChildEdgeAt(0)->getDims().ToSizeVector();
auto outPrecision = getOriginalOutputPrecisionAtPort(0);
config.inConfs.push_back(dataIn);
config.outConfs.push_back(dataConfigOut);
@ -78,7 +96,7 @@ void MKLDNNConvertNode::initSupportedPrimitiveDescriptors() {
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, MKLDNNMemoryDesc(config.outConfs.front().desc).getFormat());
}
} else {
THROW_ERROR << "Incorrect number of input/output edges";
IE_THROW() << errorPrefix << " has incorrect number of input/output edges";
}
}
@ -86,18 +104,18 @@ void MKLDNNConvertNode::createPrimitive() {
auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
auto& srcMemPtr = getParentEdgeAt(0)->getMemoryPtr();
if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr())
THROW_ERROR << "Destination memory didn't allocate.";
IE_THROW() << errorPrefix << " has not allocated destination memory";
if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr())
THROW_ERROR << "Input memory didn't allocate.";
IE_THROW() << errorPrefix << " has not allocated input memory";
if (getSelectedPrimitiveDescriptor() == nullptr)
THROW_ERROR << "Preferable primitive descriptor is not set.";
IE_THROW() << errorPrefix << " has nullable preferable primitive descriptor";
}
void MKLDNNConvertNode::execute(mkldnn::stream strm) {
auto& parentMem = getParentEdgeAt(0)->getMemory();
auto& childMem = getChildEdgeAt(0)->getMemory();
if (parentMem.GetElementsCount() != childMem.GetElementsCount())
THROW_ERROR << "Input and output buffers have different elements count";
IE_THROW() << errorPrefix << " has different elements number in input and output buffers";
void* srcPtr = parentMem.GetPtr();
void* dstPtr = childMem.GetPtr();
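// The element-wise precision conversion itself is performed by the shared cpu_convert helper (see common/cpu_convert.h).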
@ -107,4 +125,5 @@ void MKLDNNConvertNode::execute(mkldnn::stream strm) {
bool MKLDNNConvertNode::created() const {
return getType() == Convert;
}
REG_MKLDNN_PRIM_FOR(MKLDNNConvertNode, Convert);

View File

@ -13,7 +13,9 @@ namespace MKLDNNPlugin {
class MKLDNNConvertNode : public MKLDNNNode {
public:
MKLDNNConvertNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
MKLDNNConvertNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
MKLDNNConvertNode(const InferenceEngine::SizeVector &dims, const InferenceEngine::Precision &inPrc, const InferenceEngine::Precision &outPrc,
const std::string &nodeName, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNConvertNode() override = default;
void getSupportedDescriptors() override;
@ -37,9 +39,13 @@ public:
std::shared_ptr<const InferenceEngine::TensorDesc> getInput() const { return input; }
std::shared_ptr<const InferenceEngine::TensorDesc> getOutput() const { return output; }
static bool isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept;
private:
std::shared_ptr<InferenceEngine::TensorDesc> input;
std::shared_ptr<InferenceEngine::TensorDesc> output;
std::string errorPrefix;
};
} // namespace MKLDNNPlugin

Some files were not shown because too many files have changed in this diff.