[CPU BF16] Bfloat16 inference optimizations (#2633)
* [CPU BF16] Greedy mode was added * [IE TESTS][BF16] Added support for operations with bf16 precision in the single layer tests. * Added cpu specific bfloat16 single layer tests for the jit_eltwise primitive. * [CPU TESTS] Activation and logical single layer tests fixes. * [IE TESTS] Fix activation single layer tests run. * [IE TESTS][CPU] CPUTestBase further refactoring. * [CPU BF16] Support for Bfloat16 type was added to the MVN layer. (#3) * [CPU BF16] MVN layer bfloat16 compatibility. * [CPU BF16] MVN bfloat16 minor fixes. * [CPU BF16] MVN node exception about BF16 support replaced with precision redefinition. * [CPU BF16] MVN layer bloat16 support fixed for quantization operations and blocking layout. * [CPU] Input and output precision checks were added to MVN layer. * [IE TESTS][CPU BF16] Most of the bloat16 tests have been fixed. * Bf16 crop layer (#4) * [IE TESTS][CPU] Cpu specific test for the Crop layer has been created. * [IE TESTS][CPU] Deprecated Crop single layer test removed. * [CPU BF16] Bfloat16 precision was added to the Crop layer. * [CPU BF16] Crop layer minor code improvements. * [IE TESTS][CPU] Crop layer test added 2D tensor tests. * [IE TESTS][CPU] Crop layer test, obsolete comment removed. * [IE TESTS][CPU] Fixed CropIE include path. * Crop test fix for older gcc compiler. * [CPU BF16] Reduce layer extended with bfloat16 support. * [IE TESTS][CPU] CPU specific single layer test for Reduce operation. * BF16 optimized layers * [CPU BF16] Bfloat16 custom type added to the MKLDNN plugin. * [CPU BF16] Mem alignment to 16 bytes added to bfloat16 class union. * [IE TESTS][CPU] Permute cpu specific single layer test and minor cpu tests fixes * MVN cpu single layer tests extended with nhwc ndhwc layouts. * Mod mode removed from Eltwise cpu single layer test. * Permute cpu specific single layer test. * Smoke keyword was added to the CPU single layer tests. 
* Normalize node was modified for BF16 support * [CPU BF16] The RegionYolo layer has been extended with the bfloat16 type support. * Resample node was extended with BF16 * Select layer was enabled with BF16 * psroi supports bf16 (#7) * reorders replaces converts (#9) * BF16 planar pooling was enabled * [CPU BF16] Cpu_convert added to the RegionYOLO node. * [IE TESTS][CPU] Crop single layer test has been rewritten using the StridedSlice operation. * [IE TESTS][CPU] Covert layer test extended with bf16 precision. * [CPU BF16] The bfloat16 class was renamed bfloat16_t and some refactoring has been done. * [CPU BF16] RegionYOLO and Softmax were aligned with the review. * [IE TESTS CPU] CPU single layer tests refactored according to the review suggestions. * [IE TESTS CPU] The Reduce CPU single layer test was extended with different mem orders. * [IE TESTS CPU] Minor fixes after the review. * [IE TESTS CPU] Common plugin configuration has been moved to PreparePluginConfiguration function. * Minor changes after review * StridedSlice, Select, ScaleShift notes were resolved * Fixes to the Reduce operation cpu test and minor fixes related to the review. * GPU eltwise tests fix. * psroi unrolled to the primary state; code clean (#12) * PSROIPooling layer with C++ optimizations * Minor fix for compatibility with CPUTestsBase for fuse_permute_reorder test. * Code clean & psroi rollbacked Co-authored-by: Maksim Kutakov <maksim.kutakov@intel.com> Co-authored-by: Maksim Kutakov <maxim.kutakov@gmail.com> Co-authored-by: Yury Gaydaychuk <yury.gaydaychuk@intel.com>
This commit is contained in:
parent
b7d5590d72
commit
2667bffa0d
@ -353,6 +353,9 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Convert>::createLayer(const std::shared_
|
||||
case Precision::FP16:
|
||||
precision_str = "FP16";
|
||||
break;
|
||||
case Precision::BF16:
|
||||
precision_str = "BF16";
|
||||
break;
|
||||
case Precision::FP32:
|
||||
precision_str = "FP32";
|
||||
break;
|
||||
|
0
inference-engine/src/legacy_api/src/ngraph_ops/interp.cpp
Normal file → Executable file
0
inference-engine/src/legacy_api/src/ngraph_ops/interp.cpp
Normal file → Executable file
@ -11,6 +11,7 @@
|
||||
#include <chrono>
|
||||
#include <legacy/details/ie_cnn_network_tools.h>
|
||||
#include <legacy/ie_util_internal.hpp>
|
||||
#include <legacy/graph_tools.hpp>
|
||||
#include "ngraph/type/bfloat16.hpp"
|
||||
|
||||
using namespace MKLDNNPlugin;
|
||||
@ -23,7 +24,7 @@ void precisionColoringBF16(const CNNLayerPtr layer,
|
||||
if (layer && !layer->insData.empty() && layer->input()) {
|
||||
printed_properties.insert(printed_properties.begin(),
|
||||
std::pair<std::string, std::string>("Precision",
|
||||
layer->input()->getPrecision() == Precision::FP32 ? "FP32" : "BF16"));
|
||||
layer->input()->getPrecision() == Precision::FP32 ? "FP32" : "BF16"));
|
||||
|
||||
if (layer->input()->getPrecision() == Precision::FP32) {
|
||||
node_properties.emplace_back("fillcolor", "#5A5DF0");
|
||||
@ -55,20 +56,31 @@ void BF16Transformer::convertToBFloat16(InferenceEngine::CNNNetwork &network) {
|
||||
InputsDataMap inputs = network.getInputsInfo();
|
||||
OutputsDataMap outputs = network.getOutputsInfo();
|
||||
for (auto iter : sortedLayers) {
|
||||
if (CaselessEq<std::string>()(iter->type, "convolution")) {
|
||||
auto dims = iter->insData[0].lock()->getDims();
|
||||
if ((dims.size() == 4 || dims.size() == 5) && (dims[1] == 1 || dims[1] == 3))
|
||||
continue;
|
||||
}
|
||||
|
||||
// check, if memory output node needs to be transformed
|
||||
if (iter->type == "Memory" && iter->outData.size() == 0 &&
|
||||
iter->insData[0].lock()->getPrecision() == Precision::FP32) {
|
||||
auto curPrec = iter->insData[0].lock()->getPrecision();
|
||||
iter->insData[0].lock()->setPrecision(Precision::BF16);
|
||||
}
|
||||
|
||||
for (size_t o = 0; o < iter->outData.size(); o++) {
|
||||
if (inputs.find(iter->outData[o]->getName()) == inputs.end()
|
||||
&& outputs.find(iter->outData[o]->getName()) == outputs.end()
|
||||
&& !CaselessEq<std::string>()(iter->type, "const")
|
||||
&& iter->outData[o]->getPrecision() == Precision::FP32) {
|
||||
iter->outData[o]->setPrecision(Precision::BF16);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// insert convert after input if necessary
|
||||
insertConvertAfterInput(network);
|
||||
|
||||
// convert all edges back to FP32 on demand
|
||||
optimizeToFloat(network);
|
||||
}
|
||||
@ -255,3 +267,120 @@ InferenceEngine::MemoryBlob::Ptr BF16Transformer::convertBF16ToFloat(InferenceEn
|
||||
}
|
||||
return weightsFP32;
|
||||
}
|
||||
void BF16Transformer::addLayerToCNNNetworkAfterData(
|
||||
DataPtr parentOutData,
|
||||
CNNLayer::Ptr layer,
|
||||
const std::string& nextLayerName,
|
||||
ICNNNetwork& net,
|
||||
const int childInsDataIndex) {
|
||||
CNNNetworkImpl* netImpl = dynamic_cast<CNNNetworkImpl*>(&net);
|
||||
if (netImpl == nullptr) {
|
||||
THROW_IE_EXCEPTION << "unexpected network type";
|
||||
}
|
||||
|
||||
CNNLayerPtr nextLayer;
|
||||
if (!nextLayerName.empty()) {
|
||||
netImpl->getLayerByName(nextLayerName.c_str(), nextLayer, nullptr);
|
||||
}
|
||||
|
||||
if (layer && (nextLayerName.empty() || (parentOutData == nullptr) || (childInsDataIndex != -1) ||
|
||||
(getInputTo(parentOutData).find(nextLayerName) != getInputTo(parentOutData).end()))) {
|
||||
auto getTensorDesc = [](CNNLayerPtr& nextLayer) {
|
||||
const DataPtr insData = nextLayer->insData[0].lock();
|
||||
return insData->getTensorDesc();
|
||||
};
|
||||
|
||||
const TensorDesc& parentTensorDesc = parentOutData != nullptr ? parentOutData->getTensorDesc() : getTensorDesc(nextLayer);
|
||||
DataPtr newEdgeAfterLayer(new Data(layer->name, parentTensorDesc));
|
||||
newEdgeAfterLayer->setName(layer->name);
|
||||
getCreatorLayer(newEdgeAfterLayer) = layer;
|
||||
getInputTo(newEdgeAfterLayer).clear();
|
||||
|
||||
|
||||
if (netImpl == nullptr) {
|
||||
THROW_IE_EXCEPTION << "unexpected network type";
|
||||
}
|
||||
netImpl->addData(layer->name.c_str(), newEdgeAfterLayer);
|
||||
IE_SUPPRESS_DEPRECATED_START
|
||||
netImpl->addLayer(layer);
|
||||
IE_SUPPRESS_DEPRECATED_END
|
||||
|
||||
if (parentOutData != nullptr) {
|
||||
getInputTo(parentOutData)[layer->name] = layer;
|
||||
layer->insData.push_back(parentOutData);
|
||||
}
|
||||
layer->outData.push_back(newEdgeAfterLayer);
|
||||
|
||||
if (!nextLayerName.empty()) {
|
||||
// CNNLayerPtr nextLayer = getInputTo(parentOutData)[nextLayerName];
|
||||
getInputTo(newEdgeAfterLayer)[nextLayerName] = nextLayer;
|
||||
|
||||
if (parentOutData != nullptr) {
|
||||
getInputTo(parentOutData).erase(nextLayerName);
|
||||
|
||||
if (childInsDataIndex == -1) {
|
||||
for (size_t i = 0; i < nextLayer->insData.size(); i++) {
|
||||
if (nextLayer->insData[i].lock() == parentOutData) {
|
||||
nextLayer->insData[i] = newEdgeAfterLayer;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
nextLayer->insData[childInsDataIndex] = newEdgeAfterLayer;
|
||||
}
|
||||
} else {
|
||||
nextLayer->insData.push_back(newEdgeAfterLayer);
|
||||
}
|
||||
} else {
|
||||
CNNLayerPtr parent = getCreatorLayer(parentOutData).lock();
|
||||
if (parent == nullptr) {
|
||||
THROW_IE_EXCEPTION << "parent data is absent";
|
||||
}
|
||||
netImpl->removeOutput(parent->name);
|
||||
netImpl->addData(layer->name.c_str(), newEdgeAfterLayer);
|
||||
netImpl->addOutput(layer->name);
|
||||
}
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Invalid argument";
|
||||
}
|
||||
}
|
||||
|
||||
void BF16Transformer::insertConvertAfterInput(InferenceEngine::CNNNetwork &network) {
|
||||
auto inputLayers = InferenceEngine::CNNNetGetAllInputLayers(network);
|
||||
for (auto inputIter : inputLayers) {
|
||||
for (size_t o = 0; o < inputIter->outData.size(); o++) {
|
||||
for (auto bfInitIter : getInputTo(inputIter->outData[o])) {
|
||||
if (inputIter->outData[o]->getPrecision() == Precision::BF16) {
|
||||
// we don't need to enforce bf16-mode for the next layer
|
||||
break;
|
||||
}
|
||||
auto bfInitLayer = bfInitIter.second;
|
||||
if (_initbf16.find(bfInitLayer->type) != _initbf16.end()) {
|
||||
if (CaselessEq<std::string>()(bfInitLayer->type, "convolution")) {
|
||||
// TODO: have to be removed after adding suitable implementation for convolution
|
||||
break;
|
||||
}
|
||||
// insert convert
|
||||
std::string layerName = inputIter->outData[o]->getName();
|
||||
LayerParams cnnLayerParams{layerName, "Convert", Precision::FP32};
|
||||
auto lay = std::make_shared<InferenceEngine::CNNLayer>(cnnLayerParams);
|
||||
std::map<std::string, std::string> par = {{"name", layerName},
|
||||
{"type", "Convert"},
|
||||
{"precision", "FP32"}};
|
||||
lay->params = par;
|
||||
CNNLayerPtr convertLayer(lay);
|
||||
BF16Transformer::addLayerToCNNNetworkAfterData(inputIter->outData[o], convertLayer, bfInitLayer->name,
|
||||
network);
|
||||
// compute input port id for bfInitLayer
|
||||
for (size_t i = 0; i < bfInitLayer->insData.size(); i++) {
|
||||
if (bfInitLayer->insData[i].lock()->getName() == inputIter->outData[o]->getName()) {
|
||||
// set conv input as bf
|
||||
bfInitLayer->insData[i].lock()->setPrecision(Precision::BF16);
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -8,15 +8,22 @@
|
||||
#include <caseless.hpp>
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include <legacy/details/ie_cnn_network_tools.h>
|
||||
|
||||
namespace MKLDNNPlugin {
|
||||
|
||||
class BF16Transformer {
|
||||
const InferenceEngine::details::caseless_set<std::string> _initbf16 =
|
||||
{ "convolution", "fullyconnected", "innerproduct", "gemm" };
|
||||
{ "convolution", "fullyconnected", "innerproduct", "gemm", "RegionYolo" };
|
||||
const InferenceEngine::details::caseless_set<std::string> _complementbf16 =
|
||||
{ "relu", "tanh", "elu", "square", "abs", "sqrt", "linear", "bounded_relu", "soft_relu", "logistic",
|
||||
"exp", "gelu", "clamp", "swish", "prelu", "pooling", "norm", "gather", "memory" };
|
||||
{ "relu", "tanh", "elu", "square", "abs", "sqrt", "linear", "bounded_relu", "soft_relu", "normalize",
|
||||
"sigmoid", "ReLU6", "not", "activation", "HSwish", "mish", "logistic", "mod", "resample",
|
||||
"exp", "gelu", "clamp", "swish", "prelu", "pooling", "norm", "gather", "memory", "mvn", "crop", "activation",
|
||||
"broadcast", "convert", "BatchToSpace", "DepthToSpace", "ExtractImagePatches", "concat", "power", "lrn",
|
||||
"permute", "ScatterUpdate", "ScatterElementsUpdate", "ScatterNDUpdate", "depthwise",
|
||||
"select", "ShuffleChannels", "SpaceToBatch", "SpaceToDepth", "squeeze", "StridedSlice", "unsqueeze", "eltwise",
|
||||
"ReduceAnd", "ReduceOr", "ReduceMax", "ReduceMin" };
|
||||
|
||||
const InferenceEngine::details::caseless_set<std::string> _multiinput =
|
||||
{ "concat", "eltwise" };
|
||||
// prevent fallback to fp32 without considering both input and output nodes
|
||||
@ -33,6 +40,13 @@ class BF16Transformer {
|
||||
*/
|
||||
bool tryToMarkFP32(InferenceEngine::DataPtr data, const std::set<InferenceEngine::DataPtr> &immutable);
|
||||
|
||||
/**
|
||||
* Because of singularity of input node, layer, following input doesn't support bf16 itself.
|
||||
* We fix it by insertion of convert layer, which has to be replaced to reorder in graph optimizer.
|
||||
*
|
||||
*/
|
||||
void insertConvertAfterInput(InferenceEngine::CNNNetwork &network);
|
||||
|
||||
public:
|
||||
/**
|
||||
* Restores Float point data types on edges which goes to non supported layers
|
||||
@ -61,6 +75,16 @@ public:
|
||||
*/
|
||||
void convertToBFloat16(InferenceEngine::CNNNetwork &network);
|
||||
|
||||
/**
|
||||
* inserts given layer after current tensor
|
||||
*/
|
||||
static void addLayerToCNNNetworkAfterData(
|
||||
InferenceEngine::DataPtr parentOutData,
|
||||
InferenceEngine::CNNLayerPtr layer,
|
||||
const std::string& nextLayerName,
|
||||
InferenceEngine::ICNNNetwork& net,
|
||||
const int childInsDataIndex = -1);
|
||||
|
||||
InferenceEngine::MemoryBlob::Ptr convertBF16ToFloat(InferenceEngine::MemoryBlob::Ptr);
|
||||
};
|
||||
|
||||
|
@ -145,6 +145,9 @@ void MKLDNNGraphOptimizer::ApplyImplSpecificGraphOptimizations(MKLDNNGraph &grap
|
||||
graph.RemoveDroppedNodes();
|
||||
|
||||
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
|
||||
ChangeConvertToReorder(graph);
|
||||
graph.RemoveDroppedNodes();
|
||||
|
||||
DropDoubleReorders(graph);
|
||||
graph.RemoveDroppedNodes();
|
||||
|
||||
@ -1918,6 +1921,55 @@ void MKLDNNGraphOptimizer::DropConvertReorder(MKLDNNGraph& graph) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MKLDNNGraphOptimizer::ChangeConvertToReorder(MKLDNNGraph& graph) {
|
||||
std::vector<Precision> continuousPrecisions{
|
||||
Precision::BF16,
|
||||
Precision::FP32
|
||||
};
|
||||
for (int ind = 0; ind < graph.GetNodes().size(); ind++) {
|
||||
auto convertCandidate = graph.GetNodes().at(ind);
|
||||
std::string nodeType = convertCandidate->getTypeStr();
|
||||
if (!InferenceEngine::details::CaselessEq<std::string>()(nodeType, "convert")) {
|
||||
continue;
|
||||
}
|
||||
auto inputPrecision = convertCandidate->getCnnLayer()->insData[0].lock()->getPrecision();
|
||||
auto outputPrecision = convertCandidate->getCnnLayer()->outData[0]->getPrecision();
|
||||
if (std::find(continuousPrecisions.begin(), continuousPrecisions.end(), inputPrecision) == continuousPrecisions.end() ||
|
||||
std::find(continuousPrecisions.begin(), continuousPrecisions.end(), outputPrecision) == continuousPrecisions.end()) {
|
||||
continue;
|
||||
}
|
||||
std::unordered_set<std::string> uniqueLayerNames;
|
||||
for (auto node : graph.GetNodes()) {
|
||||
uniqueLayerNames.insert(node->getCnnLayer()->name);
|
||||
}
|
||||
auto parentEdge = convertCandidate->getParentEdges()[0].lock();
|
||||
auto parentNode = parentEdge->getParent();
|
||||
auto &childEdge = convertCandidate->getChildEdgeAt(0);
|
||||
auto childNode = childEdge->getChild();
|
||||
std::string basicLayerName = childEdge->getParent()->getName() + "_" +
|
||||
MKLDNNExtensionUtils::getReorderArgs(convertCandidate->getCnnLayer()->insData[0].lock()->getTensorDesc(),
|
||||
convertCandidate->getCnnLayer()->outData[0]->getTensorDesc()) +
|
||||
"_" + childEdge->getChild()->getName();
|
||||
std::string layerName = basicLayerName;
|
||||
int idx = 0;
|
||||
while (uniqueLayerNames.find(layerName) != uniqueLayerNames.end()) {
|
||||
idx++;
|
||||
layerName = basicLayerName + "_" + std::to_string(idx);
|
||||
}
|
||||
// create temporary edge
|
||||
auto oldParentOutputPort = parentEdge->getInputNum();
|
||||
auto oldChildInputPort = childEdge->getOutputNum();
|
||||
MKLDNNEdgePtr tempEdge(new MKLDNNEdge(parentNode, childNode, oldParentOutputPort, oldChildInputPort));
|
||||
|
||||
graph.InsertReorder(tempEdge, layerName, convertCandidate->getCnnLayer()->insData[0].lock()->getTensorDesc(),
|
||||
convertCandidate->getCnnLayer()->outData[0]->getTensorDesc(), false);
|
||||
parentNode->removeEdge(parentEdge);
|
||||
parentEdge->drop();
|
||||
childEdge->drop();
|
||||
graph.DropNode(convertCandidate);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
|
||||
|
@ -46,6 +46,7 @@ private:
|
||||
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
|
||||
void DropDoubleReorders(MKLDNNGraph& graph);
|
||||
void DropConvertReorder(MKLDNNGraph& graph);
|
||||
void ChangeConvertToReorder(MKLDNNGraph &graph);
|
||||
#endif
|
||||
void FuseConvolutionAndZeroPoints(MKLDNNGraph &graph);
|
||||
void FuseBroadcastAndEltwise(MKLDNNGraph &graph);
|
||||
|
@ -105,6 +105,7 @@ void MKLDNNPlugin::MKLDNNInferRequest::PushInputData() {
|
||||
// these precisions are supported by mkldnn, so we push the blob directly
|
||||
case InferenceEngine::Precision::I8:
|
||||
case InferenceEngine::Precision::I32:
|
||||
case InferenceEngine::Precision::BF16:
|
||||
case InferenceEngine::Precision::FP32: {
|
||||
break;
|
||||
}
|
||||
|
@ -278,6 +278,7 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::ICNNNetwork &network, const st
|
||||
input_precision != InferenceEngine::Precision::I16 &&
|
||||
input_precision != InferenceEngine::Precision::I8 &&
|
||||
input_precision != InferenceEngine::Precision::U8 &&
|
||||
input_precision != InferenceEngine::Precision::BF16 &&
|
||||
input_precision != InferenceEngine::Precision::BOOL &&
|
||||
input_precision != InferenceEngine::Precision::I64 &&
|
||||
input_precision != InferenceEngine::Precision::U64) {
|
||||
|
@ -27,7 +27,7 @@ public:
|
||||
conf.axis_index_ = conf.has_axis_ ?
|
||||
std::stoi(layer->params.at("axis")) :0;
|
||||
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN)});
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, Precision::FP32)});
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -60,8 +60,8 @@ protected:
|
||||
explicit DataConfigurator(ConfLayout l):
|
||||
layout(l) {}
|
||||
|
||||
DataConfigurator(ConfLayout l, bool constant, int inplace = -1):
|
||||
layout(l), constant(constant), inplace(inplace) {}
|
||||
DataConfigurator(ConfLayout l, bool constant, int inplace = -1, Precision::ePrecision prc = Precision::UNSPECIFIED):
|
||||
layout(l), constant(constant), inplace(inplace), prc(prc) {}
|
||||
|
||||
DataConfigurator(ConfLayout l, Precision::ePrecision prc):
|
||||
layout(l), prc(prc) {}
|
||||
@ -128,14 +128,7 @@ protected:
|
||||
conf.layout = ConfLayout::PLN;
|
||||
}
|
||||
|
||||
// All extension layers support only FP32 precision!
|
||||
// fixing of BF16 precisions where they are - layers naturally support only FP32
|
||||
// if we see BF16, that means another floating point format which will be converted by reorder
|
||||
// added by current mkl-dnn cpu plugin when it figure out diff in data types on input and output of edges
|
||||
InferenceEngine::Precision precision = (conf.prc == Precision::UNSPECIFIED) ? data_desc.getPrecision() : Precision(conf.prc);
|
||||
if (precision == Precision::BF16) {
|
||||
precision = Precision::FP32;
|
||||
}
|
||||
if (conf.layout == ConfLayout::ANY) {
|
||||
dataConfig.desc = TensorDesc(precision, data_dims, InferenceEngine::Layout::ANY);
|
||||
} else {
|
||||
|
@ -31,7 +31,7 @@ public:
|
||||
|
||||
LayerConfig config;
|
||||
DataConfig dataConfig, shapeConfig;
|
||||
Precision dataPrecision = layer->outData[0]->getTensorDesc().getPrecision();
|
||||
Precision dataPrecision = layer->insData[BROADCAST_INPUT].lock()->getTensorDesc().getPrecision();
|
||||
const SizeVector& data_dims = layer->insData[BROADCAST_INPUT].lock()->getTensorDesc().getDims();
|
||||
dataConfig.desc = TensorDesc(dataPrecision, data_dims,
|
||||
layer->insData[BROADCAST_INPUT].lock()->getTensorDesc().getLayout());
|
||||
|
@ -41,19 +41,16 @@ public:
|
||||
input_precision = input->getTensorDesc().getPrecision();
|
||||
if (input_precision != Precision::FP32 && input_precision != Precision::I32 &&
|
||||
input_precision != Precision::I64) {
|
||||
THROW_IE_EXCEPTION << layer->name
|
||||
<< " Incorrect input precision of the input. Only FP32, I32 and I64 are supported!";
|
||||
input_precision = Precision::FP32;
|
||||
}
|
||||
boundaries_precision = boundaries->getTensorDesc().getPrecision();
|
||||
if (boundaries_precision != Precision::FP32 && boundaries_precision != Precision::I32 &&
|
||||
boundaries_precision != Precision::I64) {
|
||||
THROW_IE_EXCEPTION << layer->name
|
||||
<< " Incorrect input precision of the boundaries tensor. Only FP32, I32 and I64 are supported!";
|
||||
boundaries_precision = Precision::FP32;
|
||||
}
|
||||
output_precision = layer->outData[OUTPUT_TENSOR_PORT]->getTensorDesc().getPrecision();
|
||||
if (output_precision != Precision::I32 && output_precision != Precision::I64) {
|
||||
THROW_IE_EXCEPTION << layer->name
|
||||
<< " Incorrect precision of the output tensor. Only I32 and I64 are supported!";
|
||||
output_precision = Precision::I32;
|
||||
}
|
||||
|
||||
// check dimensions of input tensors
|
||||
@ -73,8 +70,8 @@ public:
|
||||
num_values = std::accumulate(input_tensor_dims.begin(), input_tensor_dims.end(), 1, std::multiplies<size_t>());
|
||||
|
||||
addConfig(layer,
|
||||
{ DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN) });
|
||||
{ DataConfigurator(ConfLayout::PLN, input_precision), DataConfigurator(ConfLayout::PLN, boundaries_precision) },
|
||||
{ DataConfigurator(ConfLayout::PLN, output_precision) });
|
||||
}
|
||||
catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
|
@ -4,13 +4,14 @@
|
||||
|
||||
#include "cpu_convert.h"
|
||||
#include "cpu_memcpy.h"
|
||||
#include "utils/bfloat16.hpp"
|
||||
#include <type_traits>
|
||||
#include <ie_parallel.hpp>
|
||||
|
||||
using namespace InferenceEngine;
|
||||
|
||||
template<typename srcType, typename dstType>
|
||||
void convert(void *srcPtr, void *dstPtr, const size_t size) {
|
||||
void convert(const void *srcPtr, void *dstPtr, const size_t size) {
|
||||
if (std::is_same<srcType, dstType>::value) {
|
||||
cpu_memcpy(dstPtr, srcPtr, size*sizeof(dstType));
|
||||
} else {
|
||||
@ -24,7 +25,7 @@ void convert(void *srcPtr, void *dstPtr, const size_t size) {
|
||||
}
|
||||
|
||||
template <typename srcType>
|
||||
void convertFrom(void *srcPtr, void *dstPtr, Precision dstPrc, const size_t size) {
|
||||
void convertFrom(const void *srcPtr, void *dstPtr, Precision dstPrc, const size_t size) {
|
||||
switch (dstPrc) {
|
||||
case Precision::U8:
|
||||
convert<srcType, PrecisionTrait<Precision::U8>::value_type>(srcPtr, dstPtr, size);
|
||||
@ -50,6 +51,9 @@ void convertFrom(void *srcPtr, void *dstPtr, Precision dstPrc, const size_t size
|
||||
case Precision::FP32:
|
||||
convert<srcType, PrecisionTrait<Precision::FP32>::value_type>(srcPtr, dstPtr, size);
|
||||
break;
|
||||
case Precision::BF16:
|
||||
convert<srcType, MKLDNNPlugin::bfloat16_t>(srcPtr, dstPtr, size);
|
||||
break;
|
||||
case Precision::BOOL:
|
||||
convert<srcType, PrecisionTrait<Precision::BOOL>::value_type>(srcPtr, dstPtr, size);
|
||||
break;
|
||||
@ -58,7 +62,7 @@ void convertFrom(void *srcPtr, void *dstPtr, Precision dstPrc, const size_t size
|
||||
}
|
||||
}
|
||||
|
||||
void cpu_convert(void *srcPtr, void *dstPtr, Precision srcPrc, Precision dstPrc, const size_t size) {
|
||||
void cpu_convert(const void *srcPtr, void *dstPtr, Precision srcPrc, Precision dstPrc, const size_t size) {
|
||||
if (srcPtr == nullptr || dstPtr == nullptr)
|
||||
THROW_IE_EXCEPTION << "cpu_convert has null data pointer";
|
||||
|
||||
@ -92,6 +96,9 @@ void cpu_convert(void *srcPtr, void *dstPtr, Precision srcPrc, Precision dstPrc,
|
||||
case Precision::FP32:
|
||||
convertFrom<PrecisionTrait<Precision::FP32>::value_type>(srcPtr, dstPtr, dstPrc, size);
|
||||
break;
|
||||
case Precision::BF16:
|
||||
convertFrom<MKLDNNPlugin::bfloat16_t>(srcPtr, dstPtr, dstPrc, size);
|
||||
break;
|
||||
case Precision::BOOL:
|
||||
convertFrom<PrecisionTrait<Precision::BOOL>::value_type>(srcPtr, dstPtr, dstPrc, size);
|
||||
break;
|
||||
|
@ -20,4 +20,4 @@
|
||||
* @return none.
|
||||
*/
|
||||
|
||||
void cpu_convert(void *srcPtr, void *dstPtr, InferenceEngine::Precision srcPrc, InferenceEngine::Precision dstPrc, const size_t size);
|
||||
void cpu_convert(const void *srcPtr, void *dstPtr, InferenceEngine::Precision srcPrc, InferenceEngine::Precision dstPrc, const size_t size);
|
||||
|
@ -3,25 +3,35 @@
|
||||
//
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <ie_parallel.hpp>
|
||||
#include <mkldnn_extension_utils.h>
|
||||
#include "jit_generator.hpp"
|
||||
#include "jit_uni_eltwise.hpp"
|
||||
#include "utils/bfloat16.hpp"
|
||||
#include "softmax.h"
|
||||
|
||||
using namespace InferenceEngine;
|
||||
using namespace MKLDNNPlugin;
|
||||
using namespace mkldnn;
|
||||
using namespace mkldnn::impl::cpu;
|
||||
using namespace mkldnn::impl::utils;
|
||||
|
||||
#define GET_OFF(field) offsetof(jit_args_softmax, field)
|
||||
|
||||
struct jit_args_softmax {
|
||||
const float* src;
|
||||
const float* dst;
|
||||
size_t stride;
|
||||
const void* src;
|
||||
void* dst;
|
||||
size_t src_stride;
|
||||
size_t dst_stride;
|
||||
size_t work_amount;
|
||||
};
|
||||
|
||||
struct jit_softmax_config_params {
|
||||
Precision src_dt;
|
||||
Precision dst_dt;
|
||||
};
|
||||
|
||||
|
||||
struct jit_uni_softmax_kernel {
|
||||
void (*ker_)(const jit_args_softmax *);
|
||||
|
||||
@ -35,14 +45,15 @@ template <cpu_isa_t isa>
|
||||
struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_generator {
|
||||
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_softmax_kernel_f32)
|
||||
|
||||
jit_uni_softmax_kernel_f32() : jit_uni_softmax_kernel(), jit_generator() {
|
||||
jit_uni_softmax_kernel_f32(jit_softmax_config_params jcp) : jit_uni_softmax_kernel(), jit_generator() {
|
||||
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f));
|
||||
|
||||
this->preamble();
|
||||
|
||||
mov(reg_src, ptr[reg_params + GET_OFF(src)]);
|
||||
mov(reg_dst, ptr[reg_params + GET_OFF(dst)]);
|
||||
mov(reg_stride, ptr[reg_params + GET_OFF(stride)]);
|
||||
mov(reg_src_stride, ptr[reg_params + GET_OFF(src_stride)]);
|
||||
mov(reg_dst_stride, ptr[reg_params + GET_OFF(dst_stride)]);
|
||||
mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
|
||||
|
||||
Xbyak::Label max_loop_label;
|
||||
@ -54,12 +65,12 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
|
||||
|
||||
mov(aux_reg_work_amount, reg_work_amount);
|
||||
mov(aux_reg_src, reg_src);
|
||||
uni_vmovups(vmm_max, ptr[aux_reg_src]);
|
||||
load_vector(vmm_max, ptr[aux_reg_src], jcp.src_dt);
|
||||
L(max_loop_label); {
|
||||
cmp(aux_reg_work_amount, 0);
|
||||
jle(max_loop_end_label, T_NEAR);
|
||||
|
||||
uni_vmovups(vmm_val, ptr[aux_reg_src]);
|
||||
load_vector(vmm_val, ptr[aux_reg_src], jcp.src_dt);
|
||||
|
||||
if (isa == sse42) {
|
||||
uni_vmovups(vmm_mask, vmm_val);
|
||||
@ -77,7 +88,7 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
|
||||
uni_vblendvps(vmm_max, vmm_max, vmm_val, vmm_mask);
|
||||
}
|
||||
|
||||
add(aux_reg_src, reg_stride);
|
||||
add(aux_reg_src, reg_src_stride);
|
||||
sub(aux_reg_work_amount, 1);
|
||||
|
||||
jmp(max_loop_label, T_NEAR);
|
||||
@ -93,16 +104,16 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
|
||||
cmp(aux_reg_work_amount, 0);
|
||||
jle(exp_loop_end_label, T_NEAR);
|
||||
|
||||
uni_vmovups(vmm_val, ptr[aux_reg_src]);
|
||||
load_vector(vmm_val, ptr[aux_reg_src], jcp.src_dt);
|
||||
|
||||
uni_vsubps(vmm_val, vmm_val, vmm_max);
|
||||
exp_injector->compute_vector_range(vmm_val.getIdx(), vmm_val.getIdx() + 1);
|
||||
uni_vaddps(vmm_exp_sum, vmm_exp_sum, vmm_val);
|
||||
|
||||
uni_vmovups(ptr[aux_reg_dst], vmm_val);
|
||||
store_vector(ptr[aux_reg_dst], vmm_val, jcp.dst_dt);
|
||||
|
||||
add(aux_reg_src, reg_stride);
|
||||
add(aux_reg_dst, reg_stride);
|
||||
add(aux_reg_src, reg_src_stride);
|
||||
add(aux_reg_dst, reg_dst_stride);
|
||||
sub(aux_reg_work_amount, 1);
|
||||
|
||||
jmp(exp_loop_label, T_NEAR);
|
||||
@ -116,13 +127,13 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
|
||||
cmp(aux_reg_work_amount, 0);
|
||||
jle(div_loop_end_label, T_NEAR);
|
||||
|
||||
uni_vmovups(vmm_val, ptr[aux_reg_dst]);
|
||||
load_vector(vmm_val, ptr[aux_reg_dst], jcp.dst_dt);
|
||||
|
||||
uni_vdivps(vmm_val, vmm_val, vmm_exp_sum);
|
||||
|
||||
uni_vmovups(ptr[aux_reg_dst], vmm_val);
|
||||
store_vector(ptr[aux_reg_dst], vmm_val, jcp.dst_dt);
|
||||
|
||||
add(aux_reg_dst, reg_stride);
|
||||
add(aux_reg_dst, reg_dst_stride);
|
||||
sub(aux_reg_work_amount, 1);
|
||||
|
||||
jmp(div_loop_label, T_NEAR);
|
||||
@ -147,7 +158,8 @@ private:
|
||||
Xbyak::Reg64 aux_reg_dst = r15;
|
||||
Xbyak::Reg64 reg_work_amount = r11;
|
||||
Xbyak::Reg64 aux_reg_work_amount = r12;
|
||||
Xbyak::Reg64 reg_stride = r14;
|
||||
Xbyak::Reg64 reg_src_stride = r14;
|
||||
Xbyak::Reg64 reg_dst_stride = r10;
|
||||
Xbyak::Reg64 reg_params = abi_param1;
|
||||
|
||||
Vmm vmm_mask = Vmm(0);
|
||||
@ -158,23 +170,64 @@ private:
|
||||
const Xbyak::Opmask k_mask = Xbyak::Opmask(1);
|
||||
|
||||
std::shared_ptr<jit_uni_eltwise_injector_f32<isa>> exp_injector;
|
||||
|
||||
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, Precision src_dt) {
|
||||
switch (src_dt) {
|
||||
case Precision::FP32:
|
||||
uni_vmovups(vmm_src, op);
|
||||
break;
|
||||
case Precision::BF16:
|
||||
vpmovzxwd(vmm_src, op);
|
||||
uni_vpslld(vmm_src, vmm_src, 16);
|
||||
break;
|
||||
default:
|
||||
assert(!"unknown src_dt");
|
||||
}
|
||||
}
|
||||
inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, Precision dst_dt) {
|
||||
Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx());
|
||||
|
||||
switch (dst_dt) {
|
||||
case Precision::FP32:
|
||||
uni_vmovups(op, vmm_dst);
|
||||
break;
|
||||
case Precision::BF16:
|
||||
vcvtneps2bf16(ymm_dst, vmm_dst);
|
||||
uni_vmovups(op, ymm_dst);
|
||||
break;
|
||||
default:
|
||||
assert(!"unknown dst_dt");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
SoftmaxGeneric::SoftmaxGeneric() {
|
||||
SoftmaxGeneric::SoftmaxGeneric(Precision inpPrc, Precision outPrc)
|
||||
: input_prec(inpPrc), output_prec(outPrc) {
|
||||
if (Precision::BF16 == output_prec) {
|
||||
if (!mayiuse(avx512_core_bf16)) {
|
||||
THROW_IE_EXCEPTION << "SoftmaxGeneric doesn't support BF16 precision on this target.";
|
||||
}
|
||||
}
|
||||
|
||||
block_size = 1;
|
||||
auto jcp = jit_softmax_config_params();
|
||||
jcp.src_dt = inpPrc;
|
||||
jcp.dst_dt = outPrc;
|
||||
|
||||
if (mayiuse(avx512_common)) {
|
||||
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<avx512_common>());
|
||||
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<avx512_common>(jcp));
|
||||
block_size = 16;
|
||||
} else if (mayiuse(avx2)) {
|
||||
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<avx2>());
|
||||
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<avx2>(jcp));
|
||||
block_size = 8;
|
||||
} else if (mayiuse(sse42)) {
|
||||
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<sse42>());
|
||||
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<sse42>(jcp));
|
||||
block_size = 4;
|
||||
}
|
||||
}
|
||||
|
||||
void SoftmaxGeneric::execute(const float *src_data, float *dst_data, int B, int C, int H, int W) {
|
||||
template<typename in_data_t, typename out_data_t>
|
||||
void SoftmaxGeneric::calculate(const in_data_t *src_data, out_data_t *dst_data, int B, int C, int H, int W) {
|
||||
for (int b = 0; b < B; b++) {
|
||||
int tail_start = 0;
|
||||
if (softmax_kernel) {
|
||||
@ -185,7 +238,8 @@ void SoftmaxGeneric::execute(const float *src_data, float *dst_data, int B, int
|
||||
|
||||
arg.src = src_data + b * C * H * W + ib * block_size;
|
||||
arg.dst = dst_data + b * C * H * W + ib * block_size;
|
||||
arg.stride = static_cast<size_t>((size_t)(H) * W * sizeof(float));
|
||||
arg.src_stride = static_cast<size_t>((size_t)(H) * W * sizeof(in_data_t));
|
||||
arg.dst_stride = static_cast<size_t>((size_t)(H) * W * sizeof(out_data_t));
|
||||
arg.work_amount = static_cast<size_t>(C);
|
||||
|
||||
(*softmax_kernel)(&arg);
|
||||
@ -214,3 +268,31 @@ void SoftmaxGeneric::execute(const float *src_data, float *dst_data, int B, int
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void SoftmaxGeneric::execute(const uint8_t *src_data, uint8_t *dst_data, int B, int C, int H, int W) {
|
||||
if (Precision::FP32 == input_prec) {
|
||||
auto float_src_data = reinterpret_cast<const float*>(src_data);
|
||||
if (Precision::FP32 == output_prec) {
|
||||
auto float_dst_data = reinterpret_cast<float*>(dst_data);
|
||||
calculate(float_src_data, float_dst_data, B, C, H, W);
|
||||
} else if (Precision::BF16 == output_prec) {
|
||||
auto bf16_dst_data = reinterpret_cast<bfloat16_t*>(dst_data);
|
||||
calculate(float_src_data, bf16_dst_data, B, C, H, W);
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
||||
}
|
||||
} else if (Precision::BF16 == input_prec) {
|
||||
auto bf16_src_data = reinterpret_cast<const bfloat16_t*>(src_data);
|
||||
if (Precision::FP32 == output_prec) {
|
||||
auto float_dst_data = reinterpret_cast<float*>(dst_data);
|
||||
calculate(bf16_src_data, float_dst_data, B, C, H, W);
|
||||
} else if (Precision::BF16 == output_prec) {
|
||||
auto bf16_dst_data = reinterpret_cast<bfloat16_t*>(dst_data);
|
||||
calculate(bf16_dst_data, bf16_dst_data, B, C, H, W);
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
||||
}
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name();
|
||||
}
|
||||
}
|
||||
|
@ -6,6 +6,7 @@
|
||||
|
||||
#include <memory>
|
||||
#include <cmath>
|
||||
#include <ie_precision.hpp>
|
||||
#include "defs.h"
|
||||
#include "ie_parallel.hpp"
|
||||
|
||||
@ -37,12 +38,16 @@ void softmax_many_batches(const float *src_data, float *dst_data, int B, int C,
|
||||
|
||||
class SoftmaxGeneric {
|
||||
public:
|
||||
SoftmaxGeneric();
|
||||
SoftmaxGeneric(InferenceEngine::Precision inpPrc, InferenceEngine::Precision outPrc);
|
||||
|
||||
void execute(const float *src_data, float *dst_data, int B, int C, int H, int W);
|
||||
void execute(const uint8_t *src_data, uint8_t *dst_data, int B, int C, int H, int W);
|
||||
private:
|
||||
template<typename in_data_t, typename out_data_t>
|
||||
void calculate(const in_data_t* src_data, out_data_t* dst_data, int B, int C, int H, int W);
|
||||
|
||||
private:
|
||||
int block_size;
|
||||
InferenceEngine::Precision input_prec, output_prec;
|
||||
std::shared_ptr<jit_uni_softmax_kernel> softmax_kernel;
|
||||
};
|
||||
|
||||
|
@ -4,10 +4,8 @@
|
||||
|
||||
#include "base.hpp"
|
||||
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "ie_parallel.hpp"
|
||||
#include "ie_precision.hpp"
|
||||
#include "common/cpu_convert.h"
|
||||
|
||||
|
@ -20,8 +20,8 @@ public:
|
||||
THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
|
||||
|
||||
std::vector<DataConfigurator> inps;
|
||||
inps.resize(layer->insData.size(), DataConfigurator(ConfLayout::PLN));
|
||||
addConfig(layer, inps, {DataConfigurator(ConfLayout::PLN)});
|
||||
inps.resize(layer->insData.size(), DataConfigurator(ConfLayout::PLN, Precision::FP32));
|
||||
addConfig(layer, inps, {DataConfigurator(ConfLayout::PLN, Precision::FP32)});
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -27,13 +27,10 @@ public:
|
||||
auto logitsData = layer->insData[0].lock();
|
||||
if (logitsData == nullptr)
|
||||
THROW_IE_EXCEPTION << _logPrefix << " has nullable logits data";
|
||||
auto logitsPrecision = logitsData->getTensorDesc().getPrecision();
|
||||
if (logitsPrecision == Precision::BF16)
|
||||
logitsPrecision = Precision::FP32;
|
||||
|
||||
LayerConfig config;
|
||||
config.inConfs.resize(layer->insData.size());
|
||||
config.inConfs[0].desc = TensorDesc(logitsPrecision,
|
||||
config.inConfs[0].desc = TensorDesc(Precision::FP32,
|
||||
logitsData->getTensorDesc().getDims(),
|
||||
TensorDesc::getLayoutByDims(logitsData->getTensorDesc().getDims()));
|
||||
auto intPrecision = Precision::I32;
|
||||
@ -48,7 +45,7 @@ public:
|
||||
|
||||
DataConfig outConfig;
|
||||
auto& outDims = layer->outData[0]->getTensorDesc().getDims();
|
||||
outConfig.desc = TensorDesc(logitsPrecision,
|
||||
outConfig.desc = TensorDesc(Precision::FP32,
|
||||
outDims,
|
||||
TensorDesc::getLayoutByDims(outDims));
|
||||
config.outConfs.push_back(outConfig);
|
||||
|
@ -112,8 +112,8 @@ public:
|
||||
_num_priors_actual = InferenceEngine::make_shared_blob<int>({Precision::I32, num_priors_actual_size, C});
|
||||
_num_priors_actual->allocate();
|
||||
|
||||
std::vector<DataConfigurator> in_data_conf(layer->insData.size(), DataConfigurator(ConfLayout::PLN));
|
||||
addConfig(layer, in_data_conf, {DataConfigurator(ConfLayout::PLN)});
|
||||
std::vector<DataConfigurator> in_data_conf(layer->insData.size(), DataConfigurator(ConfLayout::PLN, Precision::FP32));
|
||||
addConfig(layer, in_data_conf, {DataConfigurator(ConfLayout::PLN, Precision::FP32)});
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -4,7 +4,6 @@
|
||||
|
||||
#include "embedding_bag_sum.hpp"
|
||||
#include "ie_parallel.hpp"
|
||||
#include "jit_generator.hpp"
|
||||
#include "list.hpp"
|
||||
|
||||
#include <set>
|
||||
|
@ -28,9 +28,6 @@ public:
|
||||
if (fill_dims.size() > 1)
|
||||
THROW_IE_EXCEPTION << layer->name << " Fill dimensions vector should be 1 dimension";
|
||||
|
||||
if (layer->insData[FILL_DIMS].lock()->getTensorDesc().getPrecision() != Precision::I32)
|
||||
THROW_IE_EXCEPTION << layer->name << " Fill dimensions vector should be I32!";
|
||||
|
||||
SizeVector value_dims = layer->insData[FILL_VALUE].lock()->getTensorDesc().getDims();
|
||||
if (value_dims.size() > 1)
|
||||
THROW_IE_EXCEPTION << layer->name << " Value scalar should have 1 dimension";
|
||||
@ -39,12 +36,12 @@ public:
|
||||
layer->outData[0]->getTensorDesc().getPrecision() == Precision::I32) &&
|
||||
!(layer->insData[FILL_VALUE].lock()->getTensorDesc().getPrecision() == Precision::FP32 &&
|
||||
layer->outData[0]->getTensorDesc().getPrecision() == Precision::FP32)) {
|
||||
THROW_IE_EXCEPTION << layer->name <<
|
||||
" 'Value' input scalars and output tensor should have same precision and only FP32 and I32 are supported!";
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::FP32) },
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::FP32) });
|
||||
} else {
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN) });
|
||||
}
|
||||
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN) });
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -27,7 +27,7 @@ public:
|
||||
|
||||
Precision inIdxPrecision = layer->insData[GATHER_INDEXES].lock()->getTensorDesc().getPrecision();
|
||||
if (inIdxPrecision != Precision::FP32 && inIdxPrecision != Precision::I32 && inIdxPrecision != Precision::FP16)
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect input precision. Only FP32, FP16 or I32 are supported!";
|
||||
inIdxPrecision = Precision::I32;
|
||||
|
||||
axis = layer->GetParamAsInt("axis");
|
||||
|
||||
@ -52,7 +52,7 @@ public:
|
||||
|
||||
LayerConfig config;
|
||||
DataConfig dataConfigIdx, dataConfigDct;
|
||||
Precision dataPrecision = layer->outData[0]->getTensorDesc().getPrecision();
|
||||
Precision dataPrecision = layer->insData[GATHER_DICTIONARY].lock()->getTensorDesc().getPrecision();
|
||||
dataConfigDct.desc = TensorDesc(dataPrecision, dictionary_dims,
|
||||
layer->insData[GATHER_DICTIONARY].lock()->getTensorDesc().getLayoutByDims(dictionary_dims));
|
||||
config.inConfs.push_back(dataConfigDct);
|
||||
|
@ -30,9 +30,8 @@ public:
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect number of output edges.";
|
||||
|
||||
precision = layer->insData[GATHER_TREE_STEP_IDX].lock()->getTensorDesc().getPrecision();
|
||||
|
||||
if (precision != Precision::FP32 && precision != Precision::I32)
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect data tensor precision. Only I32 or FP32 are supported.";
|
||||
precision = Precision::FP32;
|
||||
|
||||
if (layer->insData[GATHER_TREE_PARENT_IDX].lock()->getTensorDesc().getPrecision() != precision ||
|
||||
layer->insData[GATHER_TREE_MAX_SEQ_LEN].lock()->getTensorDesc().getPrecision() != precision ||
|
||||
@ -49,9 +48,9 @@ public:
|
||||
if (layer->insData[GATHER_TREE_END_TOKEN].lock()->getTensorDesc().getDims().size() != 1)
|
||||
THROW_IE_EXCEPTION << layer->name << " end_token should be 1 dimension";
|
||||
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN),
|
||||
DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN) });
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN, precision), DataConfigurator(ConfLayout::PLN, precision),
|
||||
DataConfigurator(ConfLayout::PLN, precision), DataConfigurator(ConfLayout::PLN, precision) },
|
||||
{ DataConfigurator(ConfLayout::PLN, precision) });
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -22,7 +22,7 @@ public:
|
||||
|
||||
bias = layer->GetParamAsFloat("bias");
|
||||
|
||||
addConfig(layer, {{ConfLayout::PLN, false, 0}}, {{ConfLayout::PLN, false, 0}});
|
||||
addConfig(layer, {{ConfLayout::PLN, false, 0, Precision::FP32}}, {{ConfLayout::PLN, false, 0, Precision::FP32}});
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -157,21 +157,13 @@ public:
|
||||
if (inData->getTensorDesc().getDims().size() != 4)
|
||||
THROW_IE_EXCEPTION << "Interp supports only 4d blobs!";
|
||||
|
||||
auto src_precision = inData->getTensorDesc().getPrecision();
|
||||
if (src_precision != Precision::FP32 && src_precision != Precision::U8 && src_precision != Precision::BF16)
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect input data tensor precision. Only U8 or FP32 or BF16 are supported!";
|
||||
|
||||
auto dst_precision = layer->outData[0]->getTensorDesc().getPrecision();
|
||||
if (dst_precision != Precision::FP32 && dst_precision != Precision::BF16)
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect output data tensor precision. Only FP32 or BF16 are supported!";
|
||||
|
||||
// We don't read other parameters since they are needed only for dst reshape in caffe
|
||||
pad_beg = layer->GetParamAsInt("pad_beg");
|
||||
pad_end = layer->GetParamAsInt("pad_end");
|
||||
align_corners = layer->GetParamAsBool("align_corners", true);
|
||||
|
||||
ConfLayout blk_layout;
|
||||
if (src_precision == Precision::U8) {
|
||||
if (inData->getTensorDesc().getPrecision() == Precision::U8) {
|
||||
LayerConfig config;
|
||||
DataConfig dataConfigDct;
|
||||
dataConfigDct.desc = TensorDesc(Precision::U8, inData->getTensorDesc().getDims(), Layout::NCHW);
|
||||
@ -197,15 +189,15 @@ public:
|
||||
if (mayiuse(avx512_common)) {
|
||||
blk_layout = ConfLayout::BLK16;
|
||||
interp_kernel.reset(new jit_uni_interp_kernel_f32<avx512_common>());
|
||||
addConfig(layer, { DataConfigurator(blk_layout) }, { DataConfigurator(blk_layout) });
|
||||
addConfig(layer, { DataConfigurator(blk_layout, Precision::FP32) }, { DataConfigurator(blk_layout, Precision::FP32) });
|
||||
} else if (mayiuse(avx2)) {
|
||||
blk_layout = ConfLayout::BLK8;
|
||||
interp_kernel.reset(new jit_uni_interp_kernel_f32<avx2>());
|
||||
addConfig(layer, { DataConfigurator(blk_layout) }, { DataConfigurator(blk_layout) });
|
||||
addConfig(layer, { DataConfigurator(blk_layout, Precision::FP32) }, { DataConfigurator(blk_layout, Precision::FP32) });
|
||||
} else {
|
||||
blk_layout = ConfLayout::BLK8;
|
||||
interp_kernel.reset(new jit_uni_interp_kernel_f32<sse42>());
|
||||
addConfig(layer, { DataConfigurator(blk_layout) }, { DataConfigurator(blk_layout) });
|
||||
addConfig(layer, { DataConfigurator(blk_layout, Precision::FP32) }, { DataConfigurator(blk_layout, Precision::FP32) });
|
||||
}
|
||||
}
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
|
@ -51,7 +51,7 @@ public:
|
||||
for (size_t i = (axis + 1); i < dims.size(); i++)
|
||||
reduced_axis_stride *= dims[i];
|
||||
|
||||
addConfig(layer, { { ConfLayout::PLN, false, 0 } }, { { ConfLayout::PLN, false, 0 } });
|
||||
addConfig(layer, { { ConfLayout::PLN, false, 0, Precision::FP32 } }, { { ConfLayout::PLN, false, 0, Precision::FP32 } });
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -86,7 +86,7 @@ public:
|
||||
else
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect Math layer type!";
|
||||
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, Precision::FP32)});
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN, false, 0, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, false, 0, Precision::FP32)});
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -58,13 +58,12 @@ void MKLDNNCropNode::initSupportedPrimitiveDescriptors() {
|
||||
return;
|
||||
|
||||
InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
|
||||
if (precision != InferenceEngine::Precision::FP32)
|
||||
precision = InferenceEngine::Precision::FP32;
|
||||
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
|
||||
precision = getCnnLayer()->outData[0]->getPrecision();
|
||||
if (precision != InferenceEngine::Precision::FP32)
|
||||
precision = InferenceEngine::Precision::FP32;
|
||||
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
|
||||
if (inputDataType != outputDataType) {
|
||||
outputDataType = inputDataType; // Crop doesn't convert precisions, only moves data
|
||||
}
|
||||
|
||||
auto& inDims = getParentEdgeAt(0)->getDims();
|
||||
if (inDims.ndims() != 2 && inDims.ndims() != 4 && inDims.ndims() != 5) {
|
||||
@ -125,19 +124,19 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) {
|
||||
if (!MKLDNNMemory::IsPlainFormat(parentMem.GetFormat())) {
|
||||
m_block_size = parentMem.GetDescriptor().data.layout_desc.blocking.block_dims[1];
|
||||
}
|
||||
int m_inner_dim = dims[dims.size() - 1] * m_block_size;
|
||||
const int m_inner_dim = dims[dims.size() - 1] * m_block_size;
|
||||
|
||||
const memory &dst_d = getChildEdgeAt(0)->getMemory().GetPrimitive();
|
||||
|
||||
int dst_ndims = dst_d.get_primitive_desc().desc().data.ndims;
|
||||
const int dst_ndims = dst_d.get_primitive_desc().desc().data.ndims;
|
||||
|
||||
// TODO: Rewrite it in general case. For every tensor
|
||||
// and rank, without using letter N,C,D,H,W
|
||||
int OFFSET_N = (dst_ndims > 0) ? offsets[0] : 0;
|
||||
int OFFSET_C = (dst_ndims > 1) ? offsets[1] : 0;
|
||||
int OFFSET_D = (dst_ndims > 4) ? offsets[offsets.size() - 3] : 0;
|
||||
int OFFSET_H = (dst_ndims > 2) ? offsets[offsets.size() - 2] : 0;
|
||||
int OFFSET_W = (dst_ndims > 3) ? offsets[offsets.size() - 1] : 0;
|
||||
const int OFFSET_N = (dst_ndims > 0) ? offsets[0] : 0;
|
||||
const int OFFSET_C = (dst_ndims > 1) ? offsets[1] : 0;
|
||||
const int OFFSET_D = (dst_ndims > 4) ? offsets[offsets.size() - 3] : 0;
|
||||
const int OFFSET_H = (dst_ndims > 2) ? offsets[offsets.size() - 2] : 0;
|
||||
const int OFFSET_W = (dst_ndims > 3) ? offsets[offsets.size() - 1] : 0;
|
||||
|
||||
// TODO: Check applicability of dyn_batch_lim in early steps.
|
||||
// crop of batch dimension doesn't support dyn batch.
|
||||
@ -155,42 +154,16 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) {
|
||||
const int IH = (src_ndims > 2) ? src_dims[src_dims.size() - 2] : 1;
|
||||
const int IW = (src_ndims > 3) ? src_dims[src_dims.size() - 1] : 1;
|
||||
|
||||
const auto *src_data = reinterpret_cast<const float*>(parentMem.GetData()) +
|
||||
parentMem.GetDescriptor().data.layout_desc.blocking.offset_padding;
|
||||
float *dst_data = reinterpret_cast<float*>(getChildEdgeAt(0)->getMemory().GetData()) +
|
||||
getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
|
||||
const uint8_t itemSize = MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(parentMem.GetDataType()));
|
||||
|
||||
#ifdef _WIN32
|
||||
if (OD == 1 && OH == 1 && OW == 1 && ID == 1 && IH == 1 && IW == 1) {
|
||||
for (int n = 0; n < ON; ++n) {
|
||||
cpu_memcpy(&dst_data[n*OC], &src_data[(n+OFFSET_N)*IC + OFFSET_C], OC * sizeof(float));
|
||||
}
|
||||
} else {
|
||||
for (int n = 0; n < ON; ++n) {
|
||||
for (int c = 0; c < OC; c += m_block_size) {
|
||||
for (int d = 0; d < OD; ++d) {
|
||||
for (int h = 0; h < OH; ++h) {
|
||||
int dst_ind =
|
||||
n*OC*OD*OH*OW + c*OD*OH*OW + d*OH*OW*m_block_size +
|
||||
h*OW*m_block_size;
|
||||
const auto *src_data = reinterpret_cast<const uint8_t *>(parentMem.GetData()) +
|
||||
itemSize * parentMem.GetDescriptor().data.layout_desc.blocking.offset_padding;
|
||||
auto *dst_data = reinterpret_cast<uint8_t*>(getChildEdgeAt(0)->getMemory().GetData()) +
|
||||
itemSize * getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
|
||||
|
||||
int src_ind =
|
||||
(n+OFFSET_N)*IC*ID*IH*IW +
|
||||
(c+OFFSET_C)*ID*IH*IW +
|
||||
(d+OFFSET_D)*IH*IW*m_block_size +
|
||||
(h+OFFSET_H)*IW*m_block_size +
|
||||
OFFSET_W*m_block_size;
|
||||
|
||||
cpu_memcpy(dst_data + dst_ind, src_data + src_ind, m_inner_dim * sizeof(float));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (OD == 1 && OH == 1 && OW == 1 && ID == 1 && IH == 1 && IW == 1) {
|
||||
parallel_for(ON, [&](int n) {
|
||||
cpu_memcpy(&dst_data[n*OC], &src_data[(n+OFFSET_N)*IC + OFFSET_C], OC * sizeof(float));
|
||||
cpu_memcpy(dst_data + itemSize * n * OC, src_data + itemSize *((n+OFFSET_N)*IC + OFFSET_C), OC * itemSize);
|
||||
});
|
||||
} else {
|
||||
parallel_for2d(ON, (OC / m_block_size), [&](int n, int c) {
|
||||
@ -201,7 +174,7 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) {
|
||||
((d+OFFSET_D)*IH*IW + OFFSET_H*IW + OFFSET_W)*m_block_size;
|
||||
|
||||
for (int h = 0; h < OH; ++h) {
|
||||
cpu_memcpy(dst_data + dst_ind, src_data + src_ind, m_inner_dim * sizeof(float));
|
||||
cpu_memcpy(dst_data + itemSize * dst_ind, src_data + itemSize * src_ind, m_inner_dim * itemSize);
|
||||
|
||||
src_ind += IW * m_block_size;
|
||||
dst_ind += OW * m_block_size;
|
||||
@ -209,7 +182,6 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) {
|
||||
}
|
||||
});
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
bool MKLDNNCropNode::created() const {
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include <vector>
|
||||
#include <mkldnn_types.h>
|
||||
#include <mkldnn_extension_utils.h>
|
||||
#include "utils/bfloat16.hpp"
|
||||
#include <legacy/ie_layers_internal.hpp>
|
||||
#include "ie_parallel.hpp"
|
||||
#include <algorithm>
|
||||
@ -31,6 +32,15 @@ using namespace Xbyak;
|
||||
|
||||
#define GET_OFF(field) offsetof(jit_mvn_call_args, field)
|
||||
|
||||
// some utility functions
|
||||
static inline bool isFloatCompatible(Precision prc) {
|
||||
return Precision::FP32 == prc || Precision::BF16 == prc;
|
||||
}
|
||||
|
||||
static inline bool isFloatCompatible(memory::data_type type) {
|
||||
return memory::f32 == type || memory::bf16 == type;
|
||||
}
|
||||
|
||||
// normalize_variance = false : src->mean
|
||||
// normalize_variance = true : src+mean->variance:sqr(x-mean)
|
||||
template <cpu_isa_t isa>
|
||||
@ -88,13 +98,13 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k
|
||||
load_vector(vmm_val, ptr[reg_src], jcp_.src_dt);
|
||||
|
||||
if (jcp_.normalize_variance) {
|
||||
if (jcp_.src_dt != memory::f32)
|
||||
if (!isFloatCompatible(jcp_.src_dt))
|
||||
uni_vcvtdq2ps(vmm_val, vmm_val);
|
||||
|
||||
uni_vsubps(vmm_val, vmm_val, vmm_mean);
|
||||
uni_vfmadd231ps(vmm_variance, vmm_val, vmm_val);
|
||||
} else {
|
||||
if (jcp_.src_dt != memory::f32)
|
||||
if (!isFloatCompatible(jcp_.src_dt))
|
||||
uni_vpaddd(vmm_sum, vmm_sum, vmm_val);
|
||||
else
|
||||
uni_vaddps(vmm_sum, vmm_sum, vmm_val);
|
||||
@ -138,7 +148,7 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k
|
||||
|
||||
uni_vmovups(ptr[reg_variance], vmm_variance);
|
||||
} else {
|
||||
if (jcp_.src_dt != memory::f32)
|
||||
if (!isFloatCompatible(jcp_.src_dt))
|
||||
uni_vcvtdq2ps(vmm_sum, vmm_sum);
|
||||
|
||||
if (!jcp_.planar_layout && !jcp_.across_channels) {
|
||||
@ -199,6 +209,10 @@ private:
|
||||
case memory::u8:
|
||||
uni_vpmovzxbd(vmm_src, op);
|
||||
break;
|
||||
case memory::bf16:
|
||||
uni_vpmovzxwd(vmm_src, op);
|
||||
uni_vpslld(vmm_src, vmm_src, 16);
|
||||
break;
|
||||
default:
|
||||
assert(!"unknown dst_dt");
|
||||
}
|
||||
@ -348,11 +362,15 @@ private:
|
||||
case memory::u8:
|
||||
uni_vpmovzxbd(vmm_src, op);
|
||||
break;
|
||||
case memory::bf16:
|
||||
uni_vpmovzxwd(vmm_src, op);
|
||||
uni_vpslld(vmm_src, vmm_src, 16);
|
||||
break;
|
||||
default:
|
||||
assert(!"unknown dst_dt");
|
||||
}
|
||||
|
||||
if (src_dt != memory::f32)
|
||||
if (!isFloatCompatible(src_dt))
|
||||
uni_vcvtdq2ps(vmm_src, vmm_src);
|
||||
}
|
||||
|
||||
@ -362,6 +380,9 @@ private:
|
||||
|
||||
if (dst_dt == memory::f32) {
|
||||
uni_vmovups(op, vmm_dst);
|
||||
} else if (dst_dt == memory::bf16) {
|
||||
vcvtneps2bf16(ymm_dst, vmm_dst);
|
||||
uni_vmovups(op, ymm_dst);
|
||||
} else if (dst_dt == memory::u8) {
|
||||
uni_vcvtps2dq(vmm_dst, vmm_dst);
|
||||
if (isa == cpu::avx512_common) {
|
||||
@ -413,7 +434,7 @@ private:
|
||||
depthwise_inj_idx++;
|
||||
} else if (post_op.is_quantization()) {
|
||||
bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
|
||||
bool do_rounding = do_dequantization || dst_dt == memory::f32 || i != p.len_ - 1;
|
||||
bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len_ - 1;
|
||||
int s_idx = vmm_val.getIdx();
|
||||
|
||||
quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_oc_off);
|
||||
@ -475,8 +496,17 @@ void MKLDNNMVNNode::initSupportedPrimitiveDescriptors() {
|
||||
|
||||
if (getParentEdgeAt(0)->getDims().ndims() < 4 || getParentEdgeAt(0)->getDims().ndims() > 5
|
||||
|| across_channels != 0 || normalize_variance != 1) {
|
||||
inputPrecision = Precision::FP32;
|
||||
outputPrecision = Precision::FP32;
|
||||
if (!isFloatCompatible(inputPrecision)) {
|
||||
inputPrecision = Precision::FP32;
|
||||
}
|
||||
if (!isFloatCompatible(outputPrecision)) {
|
||||
outputPrecision = Precision::FP32;
|
||||
}
|
||||
}
|
||||
|
||||
if (!mayiuse(avx512_core_bf16)) {
|
||||
if (outputPrecision == Precision::BF16)
|
||||
outputPrecision = Precision::FP32;
|
||||
}
|
||||
|
||||
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(inputPrecision);
|
||||
@ -498,39 +528,50 @@ void MKLDNNMVNNode::initSupportedPrimitiveDescriptors() {
|
||||
config.inConfs[0].inPlace = -1;
|
||||
config.outConfs[0].inPlace = canBeInplace ? 0 : -1;
|
||||
|
||||
auto pushDesc = [&](memory::format format) {
|
||||
auto pushDesc = [&](memory::format format, impl_desc_type impl_type) {
|
||||
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, format);
|
||||
config.outConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), outputDataType, format);
|
||||
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, format});
|
||||
supportedPrimitiveDescriptors.push_back({config, impl_type, format});
|
||||
};
|
||||
|
||||
impl_desc_type impl_type;
|
||||
if (mayiuse(cpu::avx512_common)) {
|
||||
impl_type = impl_desc_type::jit_avx512;
|
||||
} else if (mayiuse(cpu::avx2)) {
|
||||
impl_type = impl_desc_type::jit_avx2;
|
||||
} else if (mayiuse(cpu::sse42)) {
|
||||
impl_type = impl_desc_type::jit_sse42;
|
||||
} else {
|
||||
impl_type = impl_desc_type::ref;
|
||||
}
|
||||
|
||||
if (across_channels == 0 && normalize_variance == 1) {
|
||||
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
|
||||
pushDesc(memory::nhwc);
|
||||
pushDesc(memory::nhwc, impl_type);
|
||||
} else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
|
||||
pushDesc(memory::ndhwc);
|
||||
pushDesc(memory::ndhwc, impl_type);
|
||||
}
|
||||
}
|
||||
|
||||
if (inputPrecision == Precision::FP32 && outputPrecision == Precision::FP32) {
|
||||
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
|
||||
if (mayiuse(cpu::avx512_common)) {
|
||||
pushDesc(memory::nChw16c);
|
||||
} else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) {
|
||||
pushDesc(memory::nChw8c);
|
||||
if (isFloatCompatible(inputPrecision) && isFloatCompatible(outputPrecision)) {
|
||||
if (impl_desc_type::jit_avx512 == impl_type) {
|
||||
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
|
||||
pushDesc(memory::nChw16c, impl_type);
|
||||
} else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
|
||||
pushDesc(memory::nCdhw16c, impl_type);
|
||||
}
|
||||
} else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
|
||||
if (mayiuse(cpu::avx512_common)) {
|
||||
pushDesc(memory::nCdhw16c);
|
||||
} else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) {
|
||||
pushDesc(memory::nCdhw8c);
|
||||
} else if (impl_desc_type::jit_avx2 == impl_type || impl_desc_type::jit_sse42 == impl_type) {
|
||||
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
|
||||
pushDesc(memory::nChw8c, impl_type);
|
||||
} else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
|
||||
pushDesc(memory::nCdhw8c, impl_type);
|
||||
}
|
||||
}
|
||||
|
||||
if (fusedWith.empty()) {
|
||||
if (canBeInplace)
|
||||
config.inConfs[0].inPlace = 0;
|
||||
pushDesc(MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims()));
|
||||
pushDesc(MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims()), impl_type);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -614,11 +655,32 @@ void MKLDNNMVNNode::execute(mkldnn::stream strm) {
|
||||
|
||||
Layout layout = getParentEdgeAt(0)->getDesc().getLayout();
|
||||
|
||||
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
|
||||
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
|
||||
|
||||
if (layout == C || layout == NC || layout == CHW || layout == NCHW || layout == NCDHW) {
|
||||
mvn_pln(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims());
|
||||
if (input_prec == Precision::FP32) {
|
||||
auto src_data = reinterpret_cast<float*>(srcMemPtr->GetData());
|
||||
if (output_prec == Precision::FP32) {
|
||||
auto dst_data = reinterpret_cast<float*>(dstMemPtr->GetData());
|
||||
mvn_pln(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims());
|
||||
} else if (output_prec == Precision::BF16) {
|
||||
auto dst_data = reinterpret_cast<bfloat16_t*>(dstMemPtr->GetData());
|
||||
mvn_pln(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims());
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
||||
}
|
||||
} else if (input_prec == Precision::BF16) {
|
||||
auto src_data = reinterpret_cast<bfloat16_t*>(srcMemPtr->GetData());
|
||||
if (output_prec == Precision::FP32) {
|
||||
auto dst_data = reinterpret_cast<float*>(dstMemPtr->GetData());
|
||||
mvn_pln(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims());
|
||||
} else if (output_prec == Precision::BF16) {
|
||||
auto dst_data = reinterpret_cast<bfloat16_t*>(dstMemPtr->GetData());
|
||||
mvn_pln(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims());
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
||||
}
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name();
|
||||
}
|
||||
} else {
|
||||
if (output_prec == Precision::U8) {
|
||||
auto dst_data = reinterpret_cast<uint8_t *>(dstMemPtr->GetData());
|
||||
@ -631,6 +693,11 @@ void MKLDNNMVNNode::execute(mkldnn::stream strm) {
|
||||
} else if (input_prec == Precision::FP32) {
|
||||
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
|
||||
mvn_blk<float, uint8_t>(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims());
|
||||
} else if (input_prec == Precision::BF16) {
|
||||
auto src_data = reinterpret_cast<const bfloat16_t *>(srcMemPtr->GetData());
|
||||
mvn_blk<bfloat16_t, uint8_t>(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims());
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name();
|
||||
}
|
||||
} else if (output_prec == Precision::I8) {
|
||||
auto dst_data = reinterpret_cast<int8_t *>(dstMemPtr->GetData());
|
||||
@ -643,6 +710,11 @@ void MKLDNNMVNNode::execute(mkldnn::stream strm) {
|
||||
} else if (input_prec == Precision::FP32) {
|
||||
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
|
||||
mvn_blk<float, int8_t>(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims());
|
||||
} else if (input_prec == Precision::BF16) {
|
||||
auto src_data = reinterpret_cast<const bfloat16_t *>(srcMemPtr->GetData());
|
||||
mvn_blk<bfloat16_t, int8_t>(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims());
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name();
|
||||
}
|
||||
} else if (output_prec == Precision::FP32) {
|
||||
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
|
||||
@ -655,7 +727,31 @@ void MKLDNNMVNNode::execute(mkldnn::stream strm) {
|
||||
} else if (input_prec == Precision::FP32) {
|
||||
auto src_data = reinterpret_cast<float *>(srcMemPtr->GetData());
|
||||
mvn_blk<float, float>(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims());
|
||||
} else if (input_prec == Precision::BF16) {
|
||||
auto src_data = reinterpret_cast<const bfloat16_t*>(srcMemPtr->GetData());
|
||||
mvn_blk<bfloat16_t, float>(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims());
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name();
|
||||
}
|
||||
} else if (output_prec == Precision::BF16) {
|
||||
auto dst_data = reinterpret_cast<bfloat16_t*>(dstMemPtr->GetData());
|
||||
if (input_prec == Precision::U8) {
|
||||
auto src_data = reinterpret_cast<const uint8_t *>(srcMemPtr->GetData());
|
||||
mvn_blk<uint8_t, bfloat16_t>(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims());
|
||||
} else if (input_prec == Precision::I8) {
|
||||
auto src_data = reinterpret_cast<const int8_t *>(srcMemPtr->GetData());
|
||||
mvn_blk<int8_t, bfloat16_t>(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims());
|
||||
} else if (input_prec == Precision::FP32) {
|
||||
auto src_data = reinterpret_cast<float *>(srcMemPtr->GetData());
|
||||
mvn_blk<float, bfloat16_t>(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims());
|
||||
} else if (input_prec == Precision::BF16) {
|
||||
auto src_data = reinterpret_cast<const bfloat16_t*>(srcMemPtr->GetData());
|
||||
mvn_blk<bfloat16_t, bfloat16_t>(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims());
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name();
|
||||
}
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -673,7 +769,8 @@ std::tuple<size_t, size_t, size_t, size_t, size_t> MKLDNNMVNNode::get5dShapes(co
|
||||
return shapes;
|
||||
}
|
||||
|
||||
void MKLDNNMVNNode::mvn_pln(const float* src_data, float* dst_data, const SizeVector& dims) {
|
||||
template <typename in_data_t, typename out_data_t>
|
||||
void MKLDNNMVNNode::mvn_pln(const in_data_t* src_data, out_data_t* dst_data, const SizeVector& dims) {
|
||||
size_t blk_size = 1; // blk size in vmm
|
||||
if (mayiuse(cpu::avx512_common)) {
|
||||
blk_size = 16;
|
||||
@ -705,7 +802,7 @@ void MKLDNNMVNNode::mvn_pln(const float* src_data, float* dst_data, const SizeVe
|
||||
auto arg = jit_mvn_call_args();
|
||||
arg.src = src_data + cc;
|
||||
arg.sum = static_cast<float*>(&mean_internal);
|
||||
arg.src_stride = static_cast<size_t>(blk_size * sizeof(float));
|
||||
arg.src_stride = static_cast<size_t>(blk_size * sizeof(in_data_t));
|
||||
arg.work_amount = static_cast<size_t>(C2 / blk_size);
|
||||
(*mvn_mean_kernel)(&arg);
|
||||
for (size_t tail = tail_across_channels; tail < C2; tail++) {
|
||||
@ -737,7 +834,7 @@ void MKLDNNMVNNode::mvn_pln(const float* src_data, float* dst_data, const SizeVe
|
||||
arg.src = src_data + cc;
|
||||
arg.mean = static_cast<float*>(&mean);
|
||||
arg.variance = static_cast<float*>(&variance_internal);
|
||||
arg.src_stride = static_cast<size_t>(blk_size * sizeof(float));
|
||||
arg.src_stride = static_cast<size_t>(blk_size * sizeof(in_data_t));
|
||||
arg.work_amount = static_cast<size_t>(C2 / blk_size);
|
||||
(*mvn_variance_kernel)(&arg);
|
||||
|
||||
@ -766,8 +863,8 @@ void MKLDNNMVNNode::mvn_pln(const float* src_data, float* dst_data, const SizeVe
|
||||
arg.dst = dst_data + cc;
|
||||
arg.mean = static_cast<float*>(&mean);
|
||||
arg.variance = static_cast<float*>(&variance);
|
||||
arg.src_stride = static_cast<size_t>(blk_size * sizeof(float));
|
||||
arg.dst_stride = static_cast<size_t>(blk_size * sizeof(float));
|
||||
arg.src_stride = static_cast<size_t>(blk_size * sizeof(in_data_t));
|
||||
arg.dst_stride = static_cast<size_t>(blk_size * sizeof(out_data_t));
|
||||
arg.work_amount = static_cast<size_t>(C2 / blk_size);
|
||||
(*mvn_kernel)(&arg);
|
||||
|
||||
@ -792,8 +889,8 @@ void MKLDNNMVNNode::mvn_pln(const float* src_data, float* dst_data, const SizeVe
|
||||
arg.src = src_data + cc;
|
||||
arg.dst = dst_data + cc;
|
||||
arg.mean = static_cast<float*>(&mean);
|
||||
arg.src_stride = static_cast<size_t>(blk_size * sizeof(float));
|
||||
arg.dst_stride = static_cast<size_t>(blk_size * sizeof(float));
|
||||
arg.src_stride = static_cast<size_t>(blk_size * sizeof(in_data_t));
|
||||
arg.dst_stride = static_cast<size_t>(blk_size * sizeof(out_data_t));
|
||||
arg.work_amount = static_cast<size_t>(C2 / blk_size);
|
||||
(*mvn_kernel)(&arg);
|
||||
|
||||
@ -823,8 +920,8 @@ void MKLDNNMVNNode::mvn_pln(const float* src_data, float* dst_data, const SizeVe
|
||||
arg.src = src_data + cc;
|
||||
arg.dst = dst_data + cc;
|
||||
arg.sum = static_cast<float*>(&mean);
|
||||
arg.src_stride = static_cast<size_t>(blk_size * sizeof(float));
|
||||
arg.dst_stride = static_cast<size_t>(blk_size * sizeof(float));
|
||||
arg.src_stride = static_cast<size_t>(blk_size * sizeof(in_data_t));
|
||||
arg.dst_stride = static_cast<size_t>(blk_size * sizeof(out_data_t));
|
||||
arg.work_amount = static_cast<size_t>(C2 / blk_size);
|
||||
(*mvn_mean_kernel)(&arg);
|
||||
|
||||
@ -1227,7 +1324,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con
|
||||
} else if (post_op.is_quantization()) {
|
||||
bool do_dequantization = post_op.quantization.alg ==
|
||||
alg_kind::quantization_quantize_dequantize;
|
||||
bool do_rounding = do_dequantization || output_prec == Precision::FP32 ||
|
||||
bool do_rounding = do_dequantization || isFloatCompatible(output_prec) ||
|
||||
i != p.len_ - 1;
|
||||
|
||||
auto quant = post_op.quantization;
|
||||
@ -1251,7 +1348,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con
|
||||
}
|
||||
}
|
||||
}
|
||||
if (output_prec == Precision::FP32) {
|
||||
if (isFloatCompatible(output_prec)) {
|
||||
dst_data[ch + w * src_stride] = dst_value;
|
||||
} else if (output_prec == Precision::U8) {
|
||||
dst_data[ch + w * src_stride] = (dst_value >= 0) ? lroundf(dst_value) : 0;
|
||||
@ -1300,7 +1397,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con
|
||||
size_t ch = cd + h * C0;
|
||||
for (size_t w = 0lu; w < W; w++) {
|
||||
float dst_value = src_data[ch + w * src_stride] - mean_buffer_ptr[c];
|
||||
if (output_prec == Precision::FP32) {
|
||||
if (isFloatCompatible(output_prec)) {
|
||||
dst_data[ch + w * src_stride] = dst_value;
|
||||
} else if (output_prec == Precision::U8) {
|
||||
dst_data[ch + w * src_stride] = (dst_value >= 0) ? lroundf(dst_value) : 0;
|
||||
|
@ -81,7 +81,8 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
void mvn_pln(const float* src_data, float* dst_data, const InferenceEngine::SizeVector& dims);
|
||||
template <typename in_data_t, typename out_data_t>
|
||||
void mvn_pln(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims);
|
||||
|
||||
template <typename in_data_t, typename out_data_t>
|
||||
void mvn_blk(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims);
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include "mkldnn_quantize_node.h"
|
||||
#include "mkldnn_eltwise_node.h"
|
||||
#include <mkldnn_extension_utils.h>
|
||||
#include "utils/bfloat16.hpp"
|
||||
#include <legacy/ie_layers_internal.hpp>
|
||||
#include "ie_parallel.hpp"
|
||||
#include "jit_uni_eltwise.hpp"
|
||||
@ -24,6 +25,10 @@ using namespace Xbyak;
|
||||
|
||||
#define GET_OFF(field) offsetof(jit_normalize_call_args, field)
|
||||
|
||||
static inline bool isFloatCompatible(memory::data_type type) {
|
||||
return memory::f32 == type || memory::bf16 == type;
|
||||
}
|
||||
|
||||
template <cpu_isa_t isa>
|
||||
struct jit_uni_normalize_modulo_kernel_f32 : public jit_uni_normalize_modulo_kernel, public jit_generator {
|
||||
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_normalize_modulo_kernel_f32)
|
||||
@ -119,6 +124,10 @@ private:
|
||||
case memory::s32:
|
||||
uni_vmovups(vmm_src, op);
|
||||
break;
|
||||
case memory::bf16:
|
||||
uni_vpmovzxwd(vmm_src, op);
|
||||
uni_vpslld(vmm_src, vmm_src, 16);
|
||||
break;
|
||||
case memory::s8:
|
||||
uni_vpmovsxbd(vmm_src, op);
|
||||
break;
|
||||
@ -128,8 +137,7 @@ private:
|
||||
default:
|
||||
assert(!"unknown dst_dt");
|
||||
}
|
||||
|
||||
if (src_dt != memory::f32)
|
||||
if (!isFloatCompatible(src_dt))
|
||||
uni_vcvtdq2ps(vmm_src, vmm_src);
|
||||
}
|
||||
};
|
||||
@ -239,7 +247,7 @@ private:
|
||||
Xbyak::Label tail_loop_label;
|
||||
Xbyak::Label tail_loop_end_label;
|
||||
|
||||
int step = vlen / sizeof(float);
|
||||
int step = jcp_.src_dt == memory::bf16 ? 16 : (vlen / sizeof(float));
|
||||
L(main_loop_label);
|
||||
{
|
||||
cmp(reg_work_amount, step);
|
||||
@ -322,7 +330,7 @@ private:
|
||||
Xbyak::Label tail_loop_label;
|
||||
Xbyak::Label tail_loop_end_label;
|
||||
|
||||
int step = vlen / sizeof(float);
|
||||
int step = jcp_.src_dt == memory::bf16 ? 16 : (vlen / sizeof(float));
|
||||
L(main_loop_label);
|
||||
{
|
||||
cmp(reg_work_amount, step);
|
||||
@ -520,6 +528,10 @@ private:
|
||||
case memory::s32:
|
||||
uni_vmovups(vmm_src, op);
|
||||
break;
|
||||
case memory::bf16:
|
||||
uni_vpmovzxwd(vmm_src, op);
|
||||
uni_vpslld(vmm_src, vmm_src, 16);
|
||||
break;
|
||||
case memory::s8:
|
||||
uni_vpmovsxbd(vmm_src, op);
|
||||
break;
|
||||
@ -529,8 +541,7 @@ private:
|
||||
default:
|
||||
assert(!"unknown dst_dt");
|
||||
}
|
||||
|
||||
if (src_dt != memory::f32)
|
||||
if (!isFloatCompatible(src_dt))
|
||||
uni_vcvtdq2ps(vmm_src, vmm_src);
|
||||
}
|
||||
|
||||
@ -540,6 +551,10 @@ private:
|
||||
case memory::s32:
|
||||
movss(xmm_src, op);
|
||||
break;
|
||||
case memory::bf16:
|
||||
pinsrw(xmm_src, op, 0x0);
|
||||
uni_vpslld(xmm_src, xmm_src, 16);
|
||||
break;
|
||||
case memory::s8:
|
||||
movsx(reg_tmp_32, op);
|
||||
movq(xmm_src, reg_tmp_64);
|
||||
@ -552,7 +567,7 @@ private:
|
||||
assert(!"unknown dst_dt");
|
||||
}
|
||||
|
||||
if (src_dt != data_type::f32) {
|
||||
if (!isFloatCompatible(src_dt)) {
|
||||
uni_vcvtdq2ps(xmm_src, xmm_src);
|
||||
}
|
||||
}
|
||||
@ -563,6 +578,9 @@ private:
|
||||
|
||||
if (dst_dt == memory::f32) {
|
||||
uni_vmovups(op, vmm_dst);
|
||||
} else if (dst_dt == memory::bf16) {
|
||||
vcvtneps2bf16(ymm_dst, vmm_dst);
|
||||
vmovdqu16(op, ymm_dst);
|
||||
} else if (dst_dt == memory::u8) {
|
||||
uni_vcvtps2dq(vmm_dst, vmm_dst);
|
||||
if (isa == cpu::avx512_common) {
|
||||
@ -596,7 +614,7 @@ private:
|
||||
}
|
||||
|
||||
inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) {
|
||||
if (dst_dt != data_type::f32) {
|
||||
if (!isFloatCompatible(dst_dt)) {
|
||||
uni_vcvtps2dq(xmm_dst, xmm_dst);
|
||||
}
|
||||
|
||||
@ -605,6 +623,10 @@ private:
|
||||
case memory::s32:
|
||||
movss(op, xmm_dst);
|
||||
break;
|
||||
case memory::bf16:
|
||||
uni_vpsrld(xmm_dst, xmm_dst, 16);
|
||||
pextrw(op, xmm_dst, 0x0);
|
||||
break;
|
||||
case memory::s8:
|
||||
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
|
||||
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
|
||||
@ -653,7 +675,7 @@ private:
|
||||
|| quantization_injectors[quantization_inj_idx] == nullptr)
|
||||
assert(!"Invalid quantization injectors.");
|
||||
bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
|
||||
bool do_rounding = do_dequantization || dst_dt == memory::f32 || i != p.len_ - 1;
|
||||
bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len_ - 1;
|
||||
|
||||
int s_idx = vmm_val.getIdx();
|
||||
|
||||
@ -747,9 +769,7 @@ void MKLDNNNormalizeNode::initSupportedPrimitiveDescriptors() {
|
||||
setPostOps(attr, true);
|
||||
|
||||
Precision inputPrecision = getCnnLayer()->insData[0].lock()->getPrecision();
|
||||
inputPrecision = inputPrecision == Precision::BF16 ? Precision(Precision::FP32) : inputPrecision;
|
||||
Precision outputPrecision = getCnnLayer()->outData[0]->getPrecision();
|
||||
outputPrecision = outputPrecision == Precision::BF16 ? Precision(Precision::FP32) : outputPrecision;
|
||||
|
||||
if (!fusedWith.empty()) {
|
||||
auto lastFusedLayer = fusedWith[fusedWith.size() - 1].get()->getCnnLayer();
|
||||
@ -758,6 +778,13 @@ void MKLDNNNormalizeNode::initSupportedPrimitiveDescriptors() {
|
||||
}
|
||||
}
|
||||
|
||||
if (inputPrecision == Precision::BF16 || outputPrecision == Precision::BF16) {
|
||||
if (!mayiuse(avx512_core_bf16))
|
||||
inputPrecision = outputPrecision = Precision::FP32;
|
||||
else
|
||||
inputPrecision = outputPrecision = Precision::BF16;
|
||||
}
|
||||
|
||||
auto isOneOf = [&](InferenceEngine::Precision precision, std::vector<InferenceEngine::Precision> precisions) {
|
||||
for (auto p : precisions) {
|
||||
if (precision == p) {
|
||||
@ -766,10 +793,10 @@ void MKLDNNNormalizeNode::initSupportedPrimitiveDescriptors() {
|
||||
}
|
||||
return false;
|
||||
};
|
||||
if (!isOneOf(inputPrecision, {Precision::FP32, Precision::I8, Precision::U8})) {
|
||||
if (!isOneOf(inputPrecision, {Precision::FP32, Precision::BF16, Precision::I8, Precision::U8})) {
|
||||
THROW_IE_EXCEPTION << "Unsupported input precision. " << getName();
|
||||
}
|
||||
if (!isOneOf(outputPrecision, {Precision::FP32, Precision::I8, Precision::U8})) {
|
||||
if (!isOneOf(outputPrecision, {Precision::FP32, Precision::BF16, Precision::I8, Precision::U8})) {
|
||||
THROW_IE_EXCEPTION << "Unsupported output precision. " << getName();
|
||||
}
|
||||
if (!isOneOf(weights_prec, {Precision::FP32, Precision::BF16})) {
|
||||
@ -918,6 +945,8 @@ void MKLDNNNormalizeNode::execute(mkldnn::stream strm) {
|
||||
} else if (input_prec == Precision::FP32) {
|
||||
auto src_data = reinterpret_cast<const float *>(src_ptr);
|
||||
normalize_function<float, uint8_t>(src_data, dst_data, dims);
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name();
|
||||
}
|
||||
} else if (output_prec == Precision::I8) {
|
||||
auto dst_data = reinterpret_cast<int8_t *>(dst_ptr);
|
||||
@ -930,6 +959,8 @@ void MKLDNNNormalizeNode::execute(mkldnn::stream strm) {
|
||||
} else if (input_prec == Precision::FP32) {
|
||||
auto src_data = reinterpret_cast<const float *>(src_ptr);
|
||||
normalize_function<float, int8_t>(src_data, dst_data, dims);
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name();
|
||||
}
|
||||
} else if (output_prec == Precision::FP32) {
|
||||
auto dst_data = reinterpret_cast<float *>(dst_ptr);
|
||||
@ -942,7 +973,15 @@ void MKLDNNNormalizeNode::execute(mkldnn::stream strm) {
|
||||
} else if (input_prec == Precision::FP32) {
|
||||
auto src_data = reinterpret_cast<const float *>(src_ptr);
|
||||
normalize_function<float, float>(src_data, dst_data, dims);
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name();
|
||||
}
|
||||
} else if (output_prec == Precision::BF16) {
|
||||
auto dst_data = reinterpret_cast<bfloat16_t*>(dst_ptr);
|
||||
auto src_data = reinterpret_cast<const bfloat16_t*>(src_ptr);
|
||||
normalize_function<bfloat16_t, bfloat16_t>(src_data, dst_data, dims);
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -91,13 +91,7 @@ void MKLDNNPoolingNode::getSupportedDescriptors() {
|
||||
MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, parentDims.ndims() == 5 ? memory::format::ndhwc : memory::format::nhwc};
|
||||
MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? memory::format::ndhwc : memory::format::nhwc};
|
||||
createDescriptor({ in_candidate }, { out_candidate });
|
||||
} else if ((parentDims.ndims() == 4 || parentDims.ndims() == 5) && (inputDataType == memory::bf16 || outputDataType == memory::bf16)) {
|
||||
MKLDNNMemoryDesc in_candidate{ parentDims, memory::bf16, parentDims.ndims() == 5 ? memory::format::nCdhw16c : memory::format::nChw16c};
|
||||
MKLDNNMemoryDesc out_candidate{ childDims, memory::bf16, parentDims.ndims() == 5 ? memory::format::nCdhw16c : memory::format::nChw16c};
|
||||
createDescriptor({ in_candidate }, { out_candidate });
|
||||
} else if ((parentDims.ndims() == 4 || parentDims.ndims() == 5) && parentDims[1] == 1) {
|
||||
inputDataType = memory::f32;
|
||||
outputDataType = memory::f32;
|
||||
// WA. We should force planar layout since it provides better performance
|
||||
MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, parentDims.ndims() == 5 ? memory::format::ncdhw : memory::format::nchw};
|
||||
MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? memory::format::ncdhw : memory::format::nchw};
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include <set>
|
||||
#include <mkldnn_types.h>
|
||||
#include <mkldnn_extension_utils.h>
|
||||
#include "utils/bfloat16.hpp"
|
||||
#include "ie_parallel.hpp"
|
||||
#include <algorithm>
|
||||
|
||||
@ -64,6 +65,11 @@ using namespace Xbyak;
|
||||
#define GET_PTR_NCD_BASE_PTR_N_BLK const uint8_t *in_ptr_ncd = in_ptr_n + src_data_size * (icb * ID + id) * IH * IW * blk_size; \
|
||||
uint8_t *out_ptr_ncd = out_ptr_n + dst_data_size * (ocb * OD + od) * OH * OW * blk_size;
|
||||
|
||||
// some utility functions
|
||||
static inline bool isFloatCompatible(memory::data_type type) {
|
||||
return memory::f32 == type || memory::bf16 == type;
|
||||
}
|
||||
|
||||
template <cpu_isa_t isa>
|
||||
struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_generator {
|
||||
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_reduce_kernel_f32)
|
||||
@ -278,13 +284,13 @@ private:
|
||||
uni_vpxor(vmm_dst, vmm_dst, vmm_dst);
|
||||
break;
|
||||
case Reduce::Max:
|
||||
if (jcp_.dst_dt == memory::f32)
|
||||
if (isFloatCompatible(jcp_.dst_dt))
|
||||
uni_vmovups(vmm_dst, table_val(2));
|
||||
else
|
||||
uni_vmovups(vmm_dst, table_val(4));
|
||||
break;
|
||||
case Reduce::Min:
|
||||
if (jcp_.dst_dt == memory::f32)
|
||||
if (isFloatCompatible(jcp_.dst_dt))
|
||||
uni_vmovups(vmm_dst, table_val(3));
|
||||
else
|
||||
uni_vmovups(vmm_dst, table_val(5));
|
||||
@ -540,6 +546,10 @@ private:
|
||||
case memory::s32:
|
||||
uni_vmovups(vmm_src, op);
|
||||
break;
|
||||
case memory::bf16:
|
||||
uni_vpmovzxwd(vmm_src, op);
|
||||
uni_vpslld(vmm_src, vmm_src, 16);
|
||||
break;
|
||||
case memory::s8:
|
||||
uni_vpmovsxbd(vmm_src, op);
|
||||
break;
|
||||
@ -550,7 +560,7 @@ private:
|
||||
assert(!"unknown src_dt");
|
||||
}
|
||||
|
||||
if (src_dt != memory::f32)
|
||||
if (!isFloatCompatible(src_dt))
|
||||
uni_vcvtdq2ps(vmm_src, vmm_src);
|
||||
}
|
||||
|
||||
@ -560,6 +570,10 @@ private:
|
||||
case memory::s32:
|
||||
movss(xmm_src, op);
|
||||
break;
|
||||
case memory::bf16:
|
||||
pinsrw(xmm_src, op, 0x0);
|
||||
uni_vpslld(xmm_src, xmm_src, 16);
|
||||
break;
|
||||
case memory::s8:
|
||||
movsx(reg_tmp_32, op);
|
||||
movq(xmm_src, reg_tmp_64);
|
||||
@ -572,7 +586,7 @@ private:
|
||||
assert(!"unknown src_dt");
|
||||
}
|
||||
|
||||
if (src_dt != data_type::f32) {
|
||||
if (!isFloatCompatible(src_dt)) {
|
||||
uni_vcvtdq2ps(xmm_src, xmm_src);
|
||||
}
|
||||
}
|
||||
@ -581,7 +595,7 @@ private:
|
||||
Xmm xmm_dst = Xmm(vmm_dst.getIdx());
|
||||
Ymm ymm_dst = Ymm(vmm_dst.getIdx());
|
||||
|
||||
if (dst_dt != memory::f32) {
|
||||
if (!isFloatCompatible(dst_dt)) {
|
||||
uni_vcvtps2dq(vmm_dst, vmm_dst);
|
||||
}
|
||||
|
||||
@ -590,6 +604,10 @@ private:
|
||||
case memory::s32:
|
||||
uni_vmovups(op, vmm_dst);
|
||||
break;
|
||||
case memory::bf16:
|
||||
vcvtneps2bf16(ymm_dst, vmm_dst);
|
||||
uni_vmovups(op, ymm_dst);
|
||||
break;
|
||||
case memory::s8:
|
||||
if (isa == avx512_common) {
|
||||
vmaxps(vmm_dst, vmm_zero, vmm_dst);
|
||||
@ -625,7 +643,7 @@ private:
|
||||
}
|
||||
|
||||
inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) {
|
||||
if (dst_dt != memory::f32) {
|
||||
if (!isFloatCompatible(dst_dt)) {
|
||||
uni_vcvtps2dq(xmm_dst, xmm_dst);
|
||||
}
|
||||
|
||||
@ -634,6 +652,10 @@ private:
|
||||
case memory::s32:
|
||||
movss(op, xmm_dst);
|
||||
break;
|
||||
case memory::bf16:
|
||||
uni_vpsrld(xmm_dst, xmm_dst, 16);
|
||||
pextrw(op, xmm_dst, 0x0);
|
||||
break;
|
||||
case memory::s8:
|
||||
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
|
||||
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
|
||||
@ -680,9 +702,10 @@ private:
|
||||
horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),...
|
||||
switch (dst_dt) {
|
||||
case memory::f32:
|
||||
movss(xmm_aux3, ptr[reg_dst]);
|
||||
case memory::bf16:
|
||||
load_scalar(xmm_aux3, ptr[reg_dst], dst_dt);
|
||||
horiz_ps(xmm_dst, xmm_aux3);
|
||||
movss(ptr[reg_dst], xmm_dst);
|
||||
store_scalar(ptr[reg_dst], xmm_dst, dst_dt);
|
||||
break;
|
||||
case memory::s32:
|
||||
movss(xmm_aux3, ptr[reg_dst]);
|
||||
@ -981,6 +1004,10 @@ private:
|
||||
case memory::s32:
|
||||
uni_vmovups(vmm_src, op);
|
||||
break;
|
||||
case memory::bf16:
|
||||
uni_vpmovzxwd(vmm_src, op);
|
||||
uni_vpslld(vmm_src, vmm_src, 16);
|
||||
break;
|
||||
case memory::s8:
|
||||
uni_vpmovsxbd(vmm_src, op);
|
||||
break;
|
||||
@ -991,7 +1018,7 @@ private:
|
||||
assert(!"unknown src_dt");
|
||||
}
|
||||
|
||||
if (src_dt != memory::f32)
|
||||
if (!isFloatCompatible(src_dt))
|
||||
uni_vcvtdq2ps(vmm_src, vmm_src);
|
||||
}
|
||||
|
||||
@ -1001,6 +1028,10 @@ private:
|
||||
case memory::s32:
|
||||
movss(xmm_src, op);
|
||||
break;
|
||||
case memory::bf16:
|
||||
pinsrw(xmm_src, op, 0x0);
|
||||
uni_vpslld(xmm_src, xmm_src, 16);
|
||||
break;
|
||||
case memory::s8:
|
||||
movsx(reg_tmp_32, op);
|
||||
movq(xmm_src, reg_tmp_64);
|
||||
@ -1013,7 +1044,7 @@ private:
|
||||
assert(!"unknown src_dt");
|
||||
}
|
||||
|
||||
if (src_dt != data_type::f32) {
|
||||
if (!isFloatCompatible(src_dt)) {
|
||||
uni_vcvtdq2ps(xmm_src, xmm_src);
|
||||
}
|
||||
}
|
||||
@ -1022,7 +1053,7 @@ private:
|
||||
Xmm xmm_dst = Xmm(vmm_dst.getIdx());
|
||||
Ymm ymm_dst = Ymm(vmm_dst.getIdx());
|
||||
|
||||
if (dst_dt != memory::f32) {
|
||||
if (!isFloatCompatible(dst_dt)) {
|
||||
uni_vcvtps2dq(vmm_dst, vmm_dst);
|
||||
}
|
||||
|
||||
@ -1031,6 +1062,10 @@ private:
|
||||
case memory::s32:
|
||||
uni_vmovups(op, vmm_dst);
|
||||
break;
|
||||
case memory::bf16:
|
||||
vcvtneps2bf16(ymm_dst, vmm_dst);
|
||||
uni_vmovups(op, ymm_dst);
|
||||
break;
|
||||
case memory::s8:
|
||||
if (isa == avx512_common) {
|
||||
vmaxps(vmm_dst, vmm_zero, vmm_dst);
|
||||
@ -1066,7 +1101,7 @@ private:
|
||||
}
|
||||
|
||||
inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) {
|
||||
if (dst_dt != memory::f32) {
|
||||
if (!isFloatCompatible(dst_dt)) {
|
||||
uni_vcvtps2dq(xmm_dst, xmm_dst);
|
||||
}
|
||||
|
||||
@ -1075,6 +1110,10 @@ private:
|
||||
case memory::s32:
|
||||
movss(op, xmm_dst);
|
||||
break;
|
||||
case memory::bf16:
|
||||
uni_vpsrld(xmm_dst, xmm_dst, 16);
|
||||
pextrw(op, xmm_dst, 0x0);
|
||||
break;
|
||||
case memory::s8:
|
||||
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
|
||||
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
|
||||
@ -1123,6 +1162,10 @@ private:
|
||||
case memory::f32:
|
||||
movss(ptr[reg_dst], xmm_dst);
|
||||
break;
|
||||
case memory::bf16:
|
||||
uni_vpsrld(xmm_dst, xmm_dst, 16);
|
||||
pextrw(ptr[reg_dst], xmm_dst, 0x0);
|
||||
break;
|
||||
case memory::s32:
|
||||
uni_vcvtps2dq(xmm_dst, xmm_dst);
|
||||
movss(ptr[reg_dst], xmm_dst);
|
||||
@ -1173,9 +1216,10 @@ private:
|
||||
horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),...
|
||||
switch (dst_dt) {
|
||||
case memory::f32:
|
||||
movss(xmm_aux3, ptr[reg_dst]);
|
||||
case memory::bf16:
|
||||
load_scalar(xmm_aux3, ptr[reg_dst], dst_dt);
|
||||
horiz_ps(xmm_dst, xmm_aux3);
|
||||
movss(ptr[reg_dst], xmm_dst);
|
||||
store_scalar(ptr[reg_dst], xmm_dst, dst_dt);
|
||||
break;
|
||||
case memory::s32:
|
||||
movss(xmm_aux3, ptr[reg_dst]);
|
||||
@ -1292,11 +1336,33 @@ void MKLDNNReduceNode::initSupportedPrimitiveDescriptors() {
|
||||
if (!supportedPrimitiveDescriptors.empty())
|
||||
return;
|
||||
|
||||
static const Precision supportedPrecisions[] = {
|
||||
Precision::FP32,
|
||||
Precision::BF16,
|
||||
Precision::I32,
|
||||
Precision::I8,
|
||||
Precision::U8
|
||||
};
|
||||
|
||||
Precision inputPrecision = getCnnLayer()->insData[REDUCE_DATA].lock()->getPrecision();
|
||||
Precision outputPrecision = getCnnLayer()->outData[0]->getPrecision();
|
||||
|
||||
if (inputPrecision == Precision::BF16) inputPrecision = Precision::FP32;
|
||||
if (outputPrecision == Precision::BF16) outputPrecision = Precision::FP32;
|
||||
jit_mode = (mayiuse(cpu::sse42)) && getParentEdgeAt(REDUCE_DATA)->getDims().ndims() <= 5 &&
|
||||
std::find(std::begin(supportedPrecisions), std::end(supportedPrecisions), inputPrecision) != std::end(supportedPrecisions) &&
|
||||
std::find(std::begin(supportedPrecisions), std::end(supportedPrecisions), outputPrecision) != std::end(supportedPrecisions);
|
||||
|
||||
if (jit_mode) {
|
||||
// Since in jit mode we use the output memory as an intermediate accumulator for certain reduce modes, we can't use BF16 output precision due to
|
||||
// the possible accuracy loss. Therefore, for such mods, we will change the output precision to FP32.
|
||||
if (Precision::BF16 == outputPrecision) {
|
||||
if (!mayiuse(avx512_core_bf16)) {
|
||||
outputPrecision = Precision::FP32;
|
||||
} else if (reduceMode != Reduce::And && reduceMode != Reduce::Or &&
|
||||
reduceMode != Reduce::Max && reduceMode != Reduce::Min) {
|
||||
outputPrecision = Precision::FP32;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(inputPrecision);
|
||||
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(outputPrecision);
|
||||
@ -1317,37 +1383,42 @@ void MKLDNNReduceNode::initSupportedPrimitiveDescriptors() {
|
||||
config.inConfs[REDUCE_INDEXES].inPlace = -1;
|
||||
config.outConfs[0].inPlace = -1;
|
||||
|
||||
auto pushDesc = [&](memory::format inFormat, memory::format outFormat, memory::data_type inDataType, memory::data_type outDataType) {
|
||||
auto pushDesc = [&](memory::format inFormat, memory::format outFormat, memory::data_type inDataType,
|
||||
memory::data_type outDataType, impl_desc_type impl_type) {
|
||||
config.inConfs[REDUCE_DATA].desc = MKLDNNMemoryDesc(getParentEdgeAt(REDUCE_DATA)->getDims(), inDataType, inFormat);
|
||||
config.inConfs[REDUCE_INDEXES].desc = MKLDNNMemoryDesc(getParentEdgeAt(REDUCE_INDEXES)->getDims(), memory::s32, memory::x);
|
||||
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outDataType, outFormat);
|
||||
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, outFormat});
|
||||
supportedPrimitiveDescriptors.push_back({config, impl_type, outFormat});
|
||||
};
|
||||
|
||||
jit_mode = (mayiuse(cpu::sse42)) && getParentEdgeAt(REDUCE_DATA)->getDims().ndims() <= 5 &&
|
||||
(inputPrecision == Precision::FP32 || inputPrecision == Precision::I32 || inputPrecision == Precision::U8 || inputPrecision == Precision::I8) &&
|
||||
(outputPrecision == Precision::FP32 || outputPrecision == Precision::I32 || outputPrecision == Precision::U8 || outputPrecision == Precision::I8);
|
||||
if (jit_mode) {
|
||||
impl_desc_type impl_type = impl_desc_type::jit_sse42;
|
||||
if (mayiuse(cpu::avx512_common)) {
|
||||
impl_type = impl_desc_type::jit_avx512;
|
||||
} else if (mayiuse(cpu::avx2)) {
|
||||
impl_type = impl_desc_type::jit_avx2;
|
||||
}
|
||||
|
||||
pushDesc(MKLDNNMemory::GetPlainFormat(memory::dims(getParentEdgeAt(REDUCE_DATA)->getDims().ndims())),
|
||||
MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())), inputDataType, outputDataType);
|
||||
MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())), inputDataType, outputDataType, impl_type);
|
||||
if (keep_dims) {
|
||||
if (getParentEdgeAt(REDUCE_DATA)->getDims().ndims() == 4 && getParentEdgeAt(REDUCE_DATA)->getDims().ToSizeVector()[1] > 1) {
|
||||
if (mayiuse(cpu::avx512_common)) {
|
||||
pushDesc(memory::nChw16c, memory::nChw16c, inputDataType, outputDataType);
|
||||
pushDesc(memory::nChw16c, memory::nChw16c, inputDataType, outputDataType, impl_type);
|
||||
} else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) {
|
||||
pushDesc(memory::nChw8c, memory::nChw8c, inputDataType, outputDataType);
|
||||
pushDesc(memory::nChw8c, memory::nChw8c, inputDataType, outputDataType, impl_type);
|
||||
}
|
||||
} else if (getParentEdgeAt(REDUCE_DATA)->getDims().ndims() == 5 && getParentEdgeAt(REDUCE_DATA)->getDims().ToSizeVector()[1] > 1) {
|
||||
if (mayiuse(cpu::avx512_common)) {
|
||||
pushDesc(memory::nCdhw16c, memory::nCdhw16c, inputDataType, outputDataType);
|
||||
pushDesc(memory::nCdhw16c, memory::nCdhw16c, inputDataType, outputDataType, impl_type);
|
||||
} else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) {
|
||||
pushDesc(memory::nCdhw8c, memory::nCdhw8c, inputDataType, outputDataType);
|
||||
pushDesc(memory::nCdhw8c, memory::nCdhw8c, inputDataType, outputDataType, impl_type);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
pushDesc(MKLDNNMemory::GetPlainFormat(memory::dims(getParentEdgeAt(REDUCE_DATA)->getDims().ndims())),
|
||||
MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())), memory::f32, memory::f32);
|
||||
MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())), memory::f32, memory::f32, impl_desc_type::ref);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1714,6 +1785,9 @@ inline void MKLDNNReduceNode::init_dst_data(uint8_t *out_ptr, size_t dst_size) {
|
||||
} else if (output_prec == Precision::I32) {
|
||||
auto out_p = reinterpret_cast<int32_t *>(out_ptr);
|
||||
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast<int32_t>(1); });
|
||||
} else if (output_prec == Precision::BF16) {
|
||||
auto out_p = reinterpret_cast<bfloat16_t*>(out_ptr);
|
||||
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast<bfloat16_t>(1); });
|
||||
} else if (output_prec == Precision::U8) {
|
||||
auto out_p = reinterpret_cast<uint8_t *>(out_ptr);
|
||||
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast<uint8_t>(1); });
|
||||
@ -1729,6 +1803,9 @@ inline void MKLDNNReduceNode::init_dst_data(uint8_t *out_ptr, size_t dst_size) {
|
||||
} else if (output_prec == Precision::I32) {
|
||||
auto out_p = reinterpret_cast<int32_t *>(out_ptr);
|
||||
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<int32_t>::min(); });
|
||||
} else if (output_prec == Precision::BF16) {
|
||||
auto out_p = reinterpret_cast<bfloat16_t*>(out_ptr);
|
||||
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<bfloat16_t>::min(); });
|
||||
} else if (output_prec == Precision::U8) {
|
||||
auto out_p = reinterpret_cast<uint8_t *>(out_ptr);
|
||||
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<uint8_t>::min(); });
|
||||
@ -1744,6 +1821,9 @@ inline void MKLDNNReduceNode::init_dst_data(uint8_t *out_ptr, size_t dst_size) {
|
||||
} else if (output_prec == Precision::I32) {
|
||||
auto out_p = reinterpret_cast<int32_t *>(out_ptr);
|
||||
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<int32_t>::max(); });
|
||||
} else if (output_prec == Precision::BF16) {
|
||||
auto out_p = reinterpret_cast<bfloat16_t*>(out_ptr);
|
||||
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<bfloat16_t>::max(); });
|
||||
} else if (output_prec == Precision::U8) {
|
||||
auto out_p = reinterpret_cast<uint8_t *>(out_ptr);
|
||||
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<uint8_t>::max(); });
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include <vector>
|
||||
#include <mkldnn_types.h>
|
||||
#include <mkldnn_extension_utils.h>
|
||||
#include "utils/bfloat16.hpp"
|
||||
#include <legacy/ie_layers_internal.hpp>
|
||||
#include "ie_parallel.hpp"
|
||||
#include <algorithm>
|
||||
@ -33,6 +34,14 @@ using namespace Xbyak;
|
||||
|
||||
#define GET_OFF(field) offsetof(jit_resample_call_args, field)
|
||||
|
||||
static inline bool isFloatCompatible(Precision prc) {
|
||||
return Precision::FP32 == prc || Precision::BF16 == prc;
|
||||
}
|
||||
|
||||
static inline bool isFloatCompatible(memory::data_type type) {
|
||||
return memory::f32 == type || memory::bf16 == type;
|
||||
}
|
||||
|
||||
template <cpu_isa_t isa>
|
||||
struct jit_uni_resample_nearest_kernel_f32 : public jit_uni_resample_nearest_kernel, public jit_generator {
|
||||
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_resample_nearest_kernel_f32)
|
||||
@ -73,7 +82,7 @@ struct jit_uni_resample_nearest_kernel_f32 : public jit_uni_resample_nearest_ker
|
||||
if (isa == cpu::avx512_common)
|
||||
uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
|
||||
|
||||
int blk_size = vlen / sizeof(float);
|
||||
int blk_size = jcp_.src_dt == memory::bf16 ? 16 : (vlen / sizeof(float));
|
||||
if (isa == cpu::sse42)
|
||||
blk_size *= 2;
|
||||
|
||||
@ -197,11 +206,15 @@ private:
|
||||
case memory::u8:
|
||||
uni_vpmovzxbd(vmm_src, op);
|
||||
break;
|
||||
case memory::bf16:
|
||||
uni_vpmovzxwd(vmm_src, op);
|
||||
uni_vpslld(vmm_src, vmm_src, 16);
|
||||
break;
|
||||
default:
|
||||
assert(!"unknown dst_dt");
|
||||
}
|
||||
|
||||
if (src_dt != memory::f32)
|
||||
if (!isFloatCompatible(src_dt))
|
||||
uni_vcvtdq2ps(vmm_src, vmm_src);
|
||||
}
|
||||
|
||||
@ -211,6 +224,9 @@ private:
|
||||
|
||||
if (dst_dt == memory::f32) {
|
||||
uni_vmovups(op, vmm_dst);
|
||||
} else if (dst_dt == memory::bf16) {
|
||||
vcvtneps2bf16(ymm_dst, vmm_dst);
|
||||
vmovdqu16(op, ymm_dst);
|
||||
} else if (dst_dt == memory::u8) {
|
||||
uni_vcvtps2dq(vmm_dst, vmm_dst);
|
||||
if (isa == cpu::avx512_common) {
|
||||
@ -262,8 +278,7 @@ private:
|
||||
depthwise_inj_idx++;
|
||||
} else if (post_op.is_quantization()) {
|
||||
bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
|
||||
bool do_rounding = do_dequantization || dst_dt == memory::f32 || i != p.len_ - 1;
|
||||
|
||||
bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len_ - 1;
|
||||
int s_idx = vmm_val.getIdx();
|
||||
|
||||
quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_oc_off);
|
||||
@ -320,12 +335,11 @@ void MKLDNNResampleNode::initSupportedPrimitiveDescriptors() {
|
||||
}
|
||||
}
|
||||
|
||||
if (inputPrecision == Precision::BF16) {
|
||||
inputPrecision = Precision::FP32;
|
||||
}
|
||||
|
||||
if (outputPrecision == Precision::BF16) {
|
||||
outputPrecision = Precision::FP32;
|
||||
if (inputPrecision == Precision::BF16 || outputPrecision == Precision::BF16) {
|
||||
if (!mayiuse(avx512_core_bf16))
|
||||
inputPrecision = outputPrecision = Precision::FP32;
|
||||
else
|
||||
inputPrecision = outputPrecision = Precision::BF16;
|
||||
}
|
||||
|
||||
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(inputPrecision);
|
||||
@ -358,7 +372,7 @@ void MKLDNNResampleNode::initSupportedPrimitiveDescriptors() {
|
||||
pushDesc(memory::ndhwc);
|
||||
}
|
||||
|
||||
if (inputPrecision == Precision::FP32 && outputPrecision == Precision::FP32) {
|
||||
if (isFloatCompatible(inputPrecision) && isFloatCompatible(outputPrecision)) {
|
||||
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
|
||||
if (mayiuse(cpu::avx512_common)) {
|
||||
pushDesc(memory::nChw16c);
|
||||
@ -456,9 +470,6 @@ void MKLDNNResampleNode::execute(mkldnn::stream strm) {
|
||||
|
||||
Layout layout = getParentEdgeAt(0)->getDesc().getLayout();
|
||||
|
||||
const auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
|
||||
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
|
||||
|
||||
SizeVector src_dim = getParentEdgeAt(0)->getDesc().getDims();
|
||||
SizeVector dst_dim = getChildEdgeAt(0)->getDesc().getDims();
|
||||
|
||||
@ -479,7 +490,17 @@ void MKLDNNResampleNode::execute(mkldnn::stream strm) {
|
||||
|
||||
if (type == "caffe.ResampleParameter.NEAREST") {
|
||||
if (layout == NCHW || layout == NCDHW) {
|
||||
NearestNeighbor_PLN(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
||||
if (output_prec == Precision::FP32) {
|
||||
auto src_data = reinterpret_cast<const float*>(srcMemPtr->GetData());
|
||||
auto dst_data = reinterpret_cast<float*>(dstMemPtr->GetData());
|
||||
NearestNeighbor_PLN<float, float>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
||||
} else if (output_prec == Precision::BF16) {
|
||||
auto src_data = reinterpret_cast<const bfloat16_t*>(srcMemPtr->GetData());
|
||||
auto dst_data = reinterpret_cast<bfloat16_t*>(dstMemPtr->GetData());
|
||||
NearestNeighbor_PLN<bfloat16_t, bfloat16_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
||||
}
|
||||
} else {
|
||||
if (output_prec == Precision::U8) {
|
||||
auto dst_data = reinterpret_cast<uint8_t *>(dstMemPtr->GetData());
|
||||
@ -492,6 +513,8 @@ void MKLDNNResampleNode::execute(mkldnn::stream strm) {
|
||||
} else if (input_prec == Precision::FP32) {
|
||||
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
|
||||
NearestNeighbor_BLK<float, uint8_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
||||
}
|
||||
} else if (output_prec == Precision::I8) {
|
||||
auto dst_data = reinterpret_cast<int8_t *>(dstMemPtr->GetData());
|
||||
@ -504,6 +527,8 @@ void MKLDNNResampleNode::execute(mkldnn::stream strm) {
|
||||
} else if (input_prec == Precision::FP32) {
|
||||
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
|
||||
NearestNeighbor_BLK<float, int8_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
||||
}
|
||||
} else if (output_prec == Precision::FP32) {
|
||||
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
|
||||
@ -516,7 +541,15 @@ void MKLDNNResampleNode::execute(mkldnn::stream strm) {
|
||||
} else if (input_prec == Precision::FP32) {
|
||||
auto src_data = reinterpret_cast<float *>(srcMemPtr->GetData());
|
||||
NearestNeighbor_BLK<float, float>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
||||
}
|
||||
} else if (output_prec == Precision::BF16) {
|
||||
auto src_data = reinterpret_cast<const bfloat16_t*>(srcMemPtr->GetData());
|
||||
auto dst_data = reinterpret_cast<bfloat16_t*>(dstMemPtr->GetData());
|
||||
NearestNeighbor_BLK<bfloat16_t, bfloat16_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
||||
}
|
||||
}
|
||||
} else if (type == "caffe.ResampleParameter.LINEAR") {
|
||||
@ -535,12 +568,22 @@ void MKLDNNResampleNode::execute(mkldnn::stream strm) {
|
||||
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
|
||||
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
|
||||
LinearInterpolation<float, float>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW, kernel_width, isDownsample && antialias);
|
||||
} else if (input_prec == Precision::BF16) {
|
||||
auto src_data = reinterpret_cast<const bfloat16_t*>(srcMemPtr->GetData());
|
||||
auto dst_data = reinterpret_cast<bfloat16_t*>(dstMemPtr->GetData());
|
||||
LinearInterpolation<bfloat16_t, bfloat16_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW, kernel_width,
|
||||
isDownsample && antialias);
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name();
|
||||
}
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported resample parameter type: " << type;
|
||||
}
|
||||
}
|
||||
|
||||
// f32 and no fused, f32->input is f32, no fuse->output is f32
|
||||
void MKLDNNResampleNode::NearestNeighbor_PLN(const float *in_ptr_, float *out_ptr_, int B, int C, int ID, int IH, int IW,
|
||||
template <typename in_data_t, typename out_data_t>
|
||||
void MKLDNNResampleNode::NearestNeighbor_PLN(const in_data_t *in_ptr_, out_data_t *out_ptr_, int B, int C, int ID, int IH, int IW,
|
||||
float fx, float fy, float fz, int OD, int OH, int OW) {
|
||||
std::vector<int> index_buffer(OD * OH * OW);
|
||||
for (int oz = 0; oz < OD; oz++) {
|
||||
@ -560,8 +603,8 @@ void MKLDNNResampleNode::NearestNeighbor_PLN(const float *in_ptr_, float *out_pt
|
||||
}
|
||||
if (resample_nearest_kernel) {
|
||||
parallel_for2d(B, C, [&](size_t b, size_t c) {
|
||||
const float *in_ptr = in_ptr_ + IW * IH * ID * C * b + IW * IH * ID * c;
|
||||
float *out_ptr = out_ptr_ + OW * OH * OD * C * b + OW * OH * OD * c;
|
||||
const in_data_t *in_ptr = in_ptr_ + IW * IH * ID * C * b + IW * IH * ID * c;
|
||||
out_data_t *out_ptr = out_ptr_ + OW * OH * OD * C * b + OW * OH * OD * c;
|
||||
|
||||
// for OW*OH*OD
|
||||
auto arg = jit_resample_call_args();
|
||||
@ -580,8 +623,8 @@ void MKLDNNResampleNode::NearestNeighbor_PLN(const float *in_ptr_, float *out_pt
|
||||
});
|
||||
} else {
|
||||
parallel_for2d(B, C, [&](size_t b, size_t c) {
|
||||
const float *in_ptr = in_ptr_ + IW * IH * ID * C * b + IW * IH * ID * c;
|
||||
float *out_ptr = out_ptr_ + OW * OH * OD * C * b + OW * OH * OD * c;
|
||||
const in_data_t *in_ptr = in_ptr_ + IW * IH * ID * C * b + IW * IH * ID * c;
|
||||
out_data_t *out_ptr = out_ptr_ + OW * OH * OD * C * b + OW * OH * OD * c;
|
||||
|
||||
for (int i_dst = 0; i_dst < OW * OH * OD; i_dst++) {
|
||||
out_ptr[i_dst] = in_ptr[index_buffer[i_dst]];
|
||||
@ -646,7 +689,7 @@ void MKLDNNResampleNode::NearestNeighbor_BLK(const in_data_t *in_ptr_, out_data_
|
||||
for (int c = tail; c < C; c++) {
|
||||
float dst_value = static_cast<float>(in_ptr_dhw[c]);
|
||||
apply_post_ops_scalar(dst_value, c);
|
||||
if (output_prec == Precision::FP32) {
|
||||
if (isFloatCompatible(output_prec)) {
|
||||
out_ptr_dhw[c] = dst_value;
|
||||
} else if (output_prec == Precision::U8) {
|
||||
out_ptr_dhw[c] = (dst_value >= 0) ? lroundf(dst_value) : 0;
|
||||
@ -671,7 +714,7 @@ void MKLDNNResampleNode::NearestNeighbor_BLK(const in_data_t *in_ptr_, out_data_
|
||||
for (int c = 0; c < C; c++) {
|
||||
float dst_value = static_cast<float>(in_ptr_dhw[c]);
|
||||
apply_post_ops_scalar(dst_value, c);
|
||||
if (output_prec == Precision::FP32) {
|
||||
if (isFloatCompatible(output_prec)) {
|
||||
out_ptr_dhw[c] = dst_value;
|
||||
} else if (output_prec == Precision::U8) {
|
||||
out_ptr_dhw[c] = (dst_value >= 0) ? lroundf(dst_value) : 0;
|
||||
@ -723,7 +766,7 @@ void MKLDNNResampleNode::NearestNeighbor_BLK(const in_data_t *in_ptr_, out_data_
|
||||
for (int blk = 0; blk < blk_size; blk++) {
|
||||
float dst_value = static_cast<float>(in_ptr_cbdhw[blk]);
|
||||
apply_post_ops_scalar(dst_value, cb * blk_size + blk);
|
||||
if (output_prec == Precision::FP32) {
|
||||
if (isFloatCompatible(output_prec)) {
|
||||
out_ptr_cbdhw[blk] = dst_value;
|
||||
} else if (output_prec == Precision::U8) {
|
||||
out_ptr_cbdhw[blk] = (dst_value >= 0) ? lroundf(dst_value) : 0;
|
||||
@ -749,8 +792,8 @@ void MKLDNNResampleNode::LinearInterpolation(const in_data_t *in_ptr_, out_data_
|
||||
float fx, float fy, float fz, int OD, int OH, int OW, int kernel_width, bool antialias) {
|
||||
if (IW == OW && IH == OH && ID == OD) {
|
||||
size_t size = B * C * ID * IH * IW;
|
||||
if (input_prec == Precision::FP32) {
|
||||
size *= sizeof(float);
|
||||
if (isFloatCompatible(input_prec)) {
|
||||
size *= sizeof(in_data_t);
|
||||
}
|
||||
cpu_memcpy(out_ptr_, in_ptr_, size);
|
||||
return;
|
||||
@ -816,7 +859,7 @@ void MKLDNNResampleNode::LinearInterpolation(const in_data_t *in_ptr_, out_data_
|
||||
out_ptr_ncdh[ox] = 0;
|
||||
} else {
|
||||
float dst_value = sum / wsum;
|
||||
if (output_prec == Precision::FP32) {
|
||||
if (isFloatCompatible(output_prec)) {
|
||||
out_ptr_ncdh[ox] = dst_value;
|
||||
} else if (output_prec == Precision::U8) {
|
||||
out_ptr_ncdh[ox] = (dst_value >= 0) ? lroundf(dst_value) : 0;
|
||||
@ -846,7 +889,7 @@ inline void MKLDNNResampleNode::apply_post_ops_scalar(float &dst_value, int inde
|
||||
} else if (post_op.is_quantization()) {
|
||||
bool do_dequantization = post_op.quantization.alg ==
|
||||
alg_kind::quantization_quantize_dequantize;
|
||||
bool do_rounding = do_dequantization || output_prec == Precision::FP32 ||
|
||||
bool do_rounding = do_dequantization || isFloatCompatible(output_prec) ||
|
||||
i != p.len_ - 1;
|
||||
|
||||
auto quant = post_op.quantization;
|
||||
|
@ -78,7 +78,8 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
void NearestNeighbor_PLN(const float *in_ptr_, float *out_ptr_, int B, int C, int ID, int IH, int IW,
|
||||
template <typename in_data_t, typename out_data_t>
|
||||
void NearestNeighbor_PLN(const in_data_t *in_ptr_, out_data_t *out_ptr_, int B, int C, int ID, int IH, int IW,
|
||||
float fx, float fy, float fz, int OD, int OH, int OW);
|
||||
template <typename in_data_t, typename out_data_t>
|
||||
void NearestNeighbor_BLK(const in_data_t *in_ptr_, out_data_t *out_ptr_, int B, int C, int ID, int IH, int IW,
|
||||
|
@ -14,21 +14,11 @@
|
||||
#include <legacy/ie_layers_internal.hpp>
|
||||
#include "ie_parallel.hpp"
|
||||
#include <algorithm>
|
||||
|
||||
#include "jit_generator.hpp"
|
||||
#include "jit_uni_eltwise.hpp"
|
||||
#include "jit_uni_depthwise.hpp"
|
||||
#include "jit_uni_quantization.hpp"
|
||||
#include "common/cpu_memcpy.h"
|
||||
|
||||
using namespace mkldnn;
|
||||
using namespace MKLDNNPlugin;
|
||||
using namespace InferenceEngine;
|
||||
using namespace mkldnn::impl;
|
||||
using namespace mkldnn::impl::cpu;
|
||||
using namespace mkldnn::impl::utils;
|
||||
using namespace Xbyak;
|
||||
|
||||
|
||||
MKLDNNScatterUpdateNode::MKLDNNScatterUpdateNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
|
||||
: MKLDNNNode(layer, eng, cache), dataSize(0lu), indicesSize(0lu), axisSize(0lu),
|
||||
|
@ -36,6 +36,8 @@ public:
|
||||
|
||||
// check a precision of the input tensor
|
||||
input_precision = layer->insData[0].lock()->getTensorDesc().getPrecision();
|
||||
if (input_precision == Precision::BF16)
|
||||
input_precision = Precision::FP32;
|
||||
if (input_precision != Precision::I32 && input_precision != Precision::FP32) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect input precision for the input. Only I32 and FP32 are supported!";
|
||||
}
|
||||
|
@ -27,7 +27,7 @@ public:
|
||||
shift_.push_back(1);
|
||||
shift_.push_back(0);
|
||||
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN)});
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, Precision::FP32)});
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -116,7 +116,7 @@ public:
|
||||
THROW_IE_EXCEPTION << "Wrong number of variance values. Not less than 1 and more than 4 variance values.";
|
||||
}
|
||||
|
||||
addConfig(layer, {{ConfLayout::ANY, true}, {ConfLayout::ANY, true}}, {{ConfLayout::PLN, true}});
|
||||
addConfig(layer, {{ConfLayout::ANY, true}, {ConfLayout::ANY, true}}, {{ConfLayout::PLN, true, -1, Precision::FP32}});
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -32,7 +32,7 @@ public:
|
||||
step_w_ = layer->GetParamAsFloat("step_w", 0);
|
||||
offset_ = layer->GetParamAsFloat("offset");
|
||||
|
||||
addConfig(layer, {{ConfLayout::PLN, true}, {ConfLayout::PLN, true}}, {{ConfLayout::PLN, true}});
|
||||
addConfig(layer, {{ConfLayout::PLN, true}, {ConfLayout::PLN, true}}, {{ConfLayout::PLN, true, -1, Precision::FP32}});
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -45,8 +45,8 @@ public:
|
||||
stride_w_ = layer->GetParamAsFloat("stride_x", 0);
|
||||
|
||||
addConfig(layer,
|
||||
{DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::ANY), DataConfigurator(ConfLayout::ANY)},
|
||||
{DataConfigurator(ConfLayout::PLN)});
|
||||
{DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::ANY), DataConfigurator(ConfLayout::ANY)},
|
||||
{DataConfigurator(ConfLayout::PLN, Precision::FP32)});
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -119,11 +119,12 @@ public:
|
||||
|
||||
store_prob = layer->outData.size() == 2;
|
||||
if (store_prob) {
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)},
|
||||
{DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)});
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::FP32)},
|
||||
{DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32)});
|
||||
} else {
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)},
|
||||
{DataConfigurator(ConfLayout::PLN)});
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, Precision::FP32)});
|
||||
}
|
||||
} catch (const InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
|
@ -296,9 +296,9 @@ public:
|
||||
|
||||
roi_indices_.resize(post_nms_topn_);
|
||||
addConfig(layer,
|
||||
{DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN),
|
||||
DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)},
|
||||
{DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)});
|
||||
{DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32)},
|
||||
{DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32)});
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -212,8 +212,8 @@ public:
|
||||
int part_w = w * part_size_ / pooled_width_;
|
||||
int class_id = c / channels_each_class;
|
||||
float trans_x = no_trans_ ? 0 :
|
||||
bottom_trans[(((n * num_classes + class_id) * 2) * part_size_ + part_h)
|
||||
* part_size_ + part_w] * trans_std_;
|
||||
bottom_trans[(((n * num_classes + class_id) * 2) * part_size_ + part_h)
|
||||
* part_size_ + part_w] * trans_std_;
|
||||
float trans_y = no_trans_ ? 0 :
|
||||
bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size_ + part_h)
|
||||
* part_size_ + part_w] * trans_std_;
|
||||
|
@ -48,13 +48,12 @@ public:
|
||||
layer->insData[RANGE_LIMIT].lock()->getTensorDesc().getPrecision() == Precision::FP32 &&
|
||||
layer->insData[RANGE_DELTA].lock()->getTensorDesc().getPrecision() == Precision::FP32 &&
|
||||
layer->outData[0]->getTensorDesc().getPrecision() == Precision::FP32)) {
|
||||
THROW_IE_EXCEPTION << layer->name <<
|
||||
" 'Start', 'Limit', 'Delta' input scalars and output tensor should have same precision" <<
|
||||
"and only FP32 and I32 are supported!";
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::FP32) }, { DataConfigurator(ConfLayout::PLN, Precision::FP32) });
|
||||
} else {
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN) });
|
||||
}
|
||||
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN) });
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -5,13 +5,19 @@
|
||||
#include "base.hpp"
|
||||
#include "common/defs.h"
|
||||
#include "common/softmax.h"
|
||||
#include "common/cpu_convert.h"
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <ie_parallel.hpp>
|
||||
#include <mkldnn_extension_utils.h>
|
||||
#include "utils/bfloat16.hpp"
|
||||
#include "common/cpu_memcpy.h"
|
||||
#include "jit_generator.hpp"
|
||||
#include "jit_uni_eltwise.hpp"
|
||||
|
||||
using namespace MKLDNNPlugin;
|
||||
using namespace mkldnn;
|
||||
using namespace mkldnn::impl::cpu;
|
||||
using namespace mkldnn::impl::utils;
|
||||
|
||||
@ -22,11 +28,18 @@ namespace Cpu {
|
||||
#define GET_OFF(field) offsetof(jit_args_logistic, field)
|
||||
|
||||
struct jit_args_logistic {
|
||||
const float* src;
|
||||
const float* dst;
|
||||
const void* src;
|
||||
void* dst;
|
||||
size_t work_amount;
|
||||
};
|
||||
|
||||
struct jit_logistic_config_params {
|
||||
InferenceEngine::Precision src_dt;
|
||||
InferenceEngine::Precision dst_dt;
|
||||
unsigned src_data_size;
|
||||
unsigned dst_data_size;
|
||||
};
|
||||
|
||||
struct jit_uni_logistic_kernel {
|
||||
void (*ker_)(const jit_args_logistic *);
|
||||
|
||||
@ -40,7 +53,7 @@ template <cpu_isa_t isa>
|
||||
struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_generator {
|
||||
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_logistic_kernel_f32)
|
||||
|
||||
jit_uni_logistic_kernel_f32() : jit_uni_logistic_kernel(), jit_generator() {
|
||||
jit_uni_logistic_kernel_f32(jit_logistic_config_params jcp) : jit_uni_logistic_kernel(), jit_generator() {
|
||||
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f));
|
||||
|
||||
this->preamble();
|
||||
@ -59,12 +72,12 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_
|
||||
cmp(reg_work_amount, step);
|
||||
jl(tail_loop_label, T_NEAR);
|
||||
|
||||
uni_vmovups(vmm_src, ptr[reg_src]);
|
||||
load_vector(vmm_src, ptr[reg_src], jcp.src_dt);
|
||||
compute_kernel();
|
||||
uni_vmovups(ptr[reg_dst], vmm_src);
|
||||
store_vector(ptr[reg_dst], vmm_src, jcp.dst_dt);
|
||||
|
||||
add(reg_src, step * sizeof(float));
|
||||
add(reg_dst, step * sizeof(float));
|
||||
add(reg_src, step * jcp.src_data_size);
|
||||
add(reg_dst, step * jcp.dst_data_size);
|
||||
sub(reg_work_amount, step);
|
||||
|
||||
jmp(main_loop_label, T_NEAR);
|
||||
@ -75,12 +88,12 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_
|
||||
cmp(reg_work_amount, step);
|
||||
jl(exit_label, T_NEAR);
|
||||
|
||||
movss(xmm_src, ptr[reg_src]);
|
||||
load_scalar(xmm_src, ptr[reg_src], jcp.src_dt);
|
||||
compute_kernel();
|
||||
movss(ptr[reg_dst], xmm_src);
|
||||
store_scalar(ptr[reg_dst], xmm_src, jcp.dst_dt);
|
||||
|
||||
add(reg_src, step * sizeof(float));
|
||||
add(reg_dst, step * sizeof(float));
|
||||
add(reg_src, step * jcp.src_data_size);
|
||||
add(reg_dst, step * jcp.dst_data_size);
|
||||
sub(reg_work_amount, step);
|
||||
|
||||
jmp(tail_loop_label, T_NEAR);
|
||||
@ -164,6 +177,61 @@ private:
|
||||
int mask_sign = 0x80000000; // 0 // mask to extract sign
|
||||
int float_1 = 0x3f800000; // 1 // 1.0f
|
||||
} vals_for_logistic_activate;
|
||||
|
||||
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, InferenceEngine::Precision src_dt) {
|
||||
switch (src_dt) {
|
||||
case InferenceEngine::Precision::FP32:
|
||||
uni_vmovups(vmm_src, op);
|
||||
break;
|
||||
case InferenceEngine::Precision::BF16:
|
||||
vpmovzxwd(vmm_src, op);
|
||||
uni_vpslld(vmm_src, vmm_src, 16);
|
||||
break;
|
||||
default:
|
||||
assert(!"unknown src_dt");
|
||||
}
|
||||
}
|
||||
inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, InferenceEngine::Precision dst_dt) {
|
||||
Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx());
|
||||
|
||||
switch (dst_dt) {
|
||||
case InferenceEngine::Precision::FP32:
|
||||
uni_vmovups(op, vmm_dst);
|
||||
break;
|
||||
case InferenceEngine::Precision::BF16:
|
||||
vcvtneps2bf16(ymm_dst, vmm_dst);
|
||||
uni_vmovups(op, ymm_dst);
|
||||
break;
|
||||
default:
|
||||
assert(!"unknown dst_dt");
|
||||
}
|
||||
}
|
||||
inline void load_scalar(Xbyak::Xmm xmm_src, const Xbyak::Address &op, InferenceEngine::Precision src_dt) {
|
||||
switch (src_dt) {
|
||||
case InferenceEngine::Precision::FP32:
|
||||
movss(xmm_src, op);
|
||||
break;
|
||||
case InferenceEngine::Precision::BF16:
|
||||
pinsrw(xmm_src, op, 0x0);
|
||||
uni_vpslld(xmm_src, xmm_src, 16);
|
||||
break;
|
||||
default:
|
||||
assert(!"unknown src_dt");
|
||||
}
|
||||
}
|
||||
inline void store_scalar(const Xbyak::Address &op, Xbyak::Xmm xmm_dst, InferenceEngine::Precision dst_dt) {
|
||||
switch (dst_dt) {
|
||||
case InferenceEngine::Precision::FP32:
|
||||
movss(op, xmm_dst);
|
||||
break;
|
||||
case InferenceEngine::Precision::BF16:
|
||||
uni_vpsrld(xmm_dst, xmm_dst, 16);
|
||||
pextrw(op, xmm_dst, 0x0);
|
||||
break;
|
||||
default:
|
||||
assert(!"unknown dst_dt");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class RegionYoloImpl: public ExtLayerBase {
|
||||
@ -173,27 +241,48 @@ public:
|
||||
if (layer->insData.size() != 1 || layer->outData.empty())
|
||||
THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
|
||||
|
||||
input_prec = layer->insData.front().lock()->getPrecision();
|
||||
output_prec = layer->outData.front()->getPrecision();
|
||||
|
||||
if (input_prec != Precision::FP32 && input_prec != Precision::BF16) {
|
||||
input_prec = Precision::FP32;
|
||||
}
|
||||
|
||||
if (output_prec != Precision::FP32 && output_prec != Precision::BF16) {
|
||||
output_prec = Precision::FP32;
|
||||
}
|
||||
|
||||
if (Precision::BF16 == output_prec) {
|
||||
if (!mayiuse(avx512_core_bf16)) {
|
||||
output_prec = Precision::FP32;
|
||||
}
|
||||
}
|
||||
|
||||
classes = layer->GetParamAsInt("classes");
|
||||
coords = layer->GetParamAsInt("coords");
|
||||
num = layer->GetParamAsInt("num");
|
||||
do_softmax = layer->GetParamAsBool("do_softmax", true);
|
||||
mask = layer->GetParamAsInts("mask", {});
|
||||
|
||||
jit_logistic_config_params jcp;
|
||||
jcp.src_dt = jcp.dst_dt = output_prec;
|
||||
jcp.src_data_size = jcp.dst_data_size = output_prec.size();
|
||||
|
||||
block_size = 1;
|
||||
if (mayiuse(avx512_common)) {
|
||||
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<avx512_common>());
|
||||
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<avx512_common>(jcp));
|
||||
block_size = 16;
|
||||
} else if (mayiuse(avx2)) {
|
||||
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<avx2>());
|
||||
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<avx2>(jcp));
|
||||
block_size = 8;
|
||||
} else if (mayiuse(sse42)) {
|
||||
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<sse42>());
|
||||
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<sse42>(jcp));
|
||||
block_size = 4;
|
||||
}
|
||||
|
||||
softmax_kernel.reset(new SoftmaxGeneric());
|
||||
softmax_kernel = std::make_shared<SoftmaxGeneric>(input_prec, output_prec);
|
||||
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN)});
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN, input_prec)}, {DataConfigurator(ConfLayout::PLN, output_prec)});
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
@ -201,19 +290,12 @@ public:
|
||||
|
||||
StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs,
|
||||
ResponseDesc *resp) noexcept override {
|
||||
const auto *src_data = inputs[0]->cbuffer().as<const float *>();
|
||||
auto *dst_data = outputs[0]->buffer().as<float *>();
|
||||
size_t mask_size = mask.size();
|
||||
|
||||
int mask_size = mask.size();
|
||||
|
||||
int IW = (inputs[0]->getTensorDesc().getDims().size() > 3) ? inputs[0]->getTensorDesc().getDims()[3] : 1;
|
||||
int IH = (inputs[0]->getTensorDesc().getDims().size() > 2) ? inputs[0]->getTensorDesc().getDims()[2] : 1;
|
||||
int IC = (inputs[0]->getTensorDesc().getDims().size() > 1) ? inputs[0]->getTensorDesc().getDims()[1] : 1;
|
||||
int B = (inputs[0]->getTensorDesc().getDims().size() > 0) ? inputs[0]->getTensorDesc().getDims()[0] : 1;
|
||||
|
||||
parallel_for(B * IC * IH * IW, [&](int i) {
|
||||
dst_data[i] = src_data[i];
|
||||
});
|
||||
size_t IW = (inputs[0]->getTensorDesc().getDims().size() > 3) ? inputs[0]->getTensorDesc().getDims()[3] : 1;
|
||||
size_t IH = (inputs[0]->getTensorDesc().getDims().size() > 2) ? inputs[0]->getTensorDesc().getDims()[2] : 1;
|
||||
size_t IC = (inputs[0]->getTensorDesc().getDims().size() > 1) ? inputs[0]->getTensorDesc().getDims()[1] : 1;
|
||||
size_t B = (inputs[0]->getTensorDesc().getDims().size() > 0) ? inputs[0]->getTensorDesc().getDims()[0] : 1;
|
||||
|
||||
int end_index = 0;
|
||||
int num_ = 0;
|
||||
@ -226,26 +308,41 @@ public:
|
||||
end_index = IW * IH * (classes + 1);
|
||||
num_ = mask_size;
|
||||
}
|
||||
int inputs_size = IH * IW * num_ * (classes + coords + 1);
|
||||
int total_size = 2 * IH * IW;
|
||||
size_t inputs_size = IH * IW * num_ * (classes + coords + 1);
|
||||
size_t total_size = 2 * IH * IW;
|
||||
|
||||
for (int b = 0; b < B; b++) {
|
||||
for (int n = 0; n < num_; n++) {
|
||||
int index = b * inputs_size + n * IW * IH * (classes + coords + 1);
|
||||
calculate_logistic(index, total_size, dst_data);
|
||||
const auto *src_data = inputs[0]->cbuffer().as<const uint8_t *>();
|
||||
auto *dst_data = outputs[0]->buffer().as<uint8_t *>();
|
||||
|
||||
index = b * inputs_size + IW * IH * (n * (classes + coords + 1) + coords);
|
||||
calculate_logistic(index, end_index, dst_data);
|
||||
try {
|
||||
cpu_convert(src_data, dst_data, inputs[0]->getTensorDesc().getPrecision(), outputs[0]->getTensorDesc().getPrecision(), B * IC * IH * IW);
|
||||
|
||||
for (int b = 0; b < B; b++) {
|
||||
for (int n = 0; n < num_; n++) {
|
||||
size_t index = b * inputs_size + n * IW * IH * (classes + coords + 1);
|
||||
calculate_logistic(index, total_size, dst_data);
|
||||
|
||||
index = b * inputs_size + IW * IH * (n * (classes + coords + 1) + coords);
|
||||
calculate_logistic(index, end_index, dst_data);
|
||||
}
|
||||
}
|
||||
|
||||
if (do_softmax) {
|
||||
int index = IW * IH * (coords + 1);
|
||||
int batch_offset = inputs_size / num;
|
||||
for (int b = 0; b < B * num; b++) {
|
||||
softmax_kernel->execute(src_data + input_prec.size() * (index + b * batch_offset),
|
||||
dst_data + output_prec.size() * (index + b * batch_offset), 1, classes, IH, IW);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (do_softmax) {
|
||||
int index = IW * IH * (coords + 1);
|
||||
int batch_offset = inputs_size / num;
|
||||
for (int b = 0; b < B * num; b++)
|
||||
softmax_kernel->execute(src_data + index + b * batch_offset, dst_data + index + b * batch_offset, 1, classes, IH, IW);
|
||||
catch (const std::exception& excp) {
|
||||
snprintf(resp->msg, sizeof(resp->msg), "%s", excp.what());
|
||||
return GENERAL_ERROR;
|
||||
}
|
||||
catch(...) {
|
||||
return GENERAL_ERROR;
|
||||
}
|
||||
|
||||
return OK;
|
||||
}
|
||||
|
||||
@ -255,6 +352,7 @@ private:
|
||||
int num;
|
||||
float do_softmax;
|
||||
std::vector<int> mask;
|
||||
Precision input_prec, output_prec;
|
||||
|
||||
int block_size;
|
||||
std::shared_ptr<jit_uni_logistic_kernel> logistic_kernel;
|
||||
@ -281,7 +379,9 @@ private:
|
||||
return src;
|
||||
}
|
||||
|
||||
inline void calculate_logistic(int start_index, int count, float* dst_data) {
|
||||
|
||||
inline void calculate_logistic(size_t start_index, int count, uint8_t * dst_data) {
|
||||
auto dst_data_size = output_prec.size();
|
||||
if (logistic_kernel) {
|
||||
int blocks_num = div_up(count, block_size);
|
||||
parallel_for(blocks_num, [&](int ib) {
|
||||
@ -289,15 +389,24 @@ private:
|
||||
int work_amount = std::min(count - idx, block_size);
|
||||
|
||||
auto arg = jit_args_logistic();
|
||||
arg.src = dst_data + start_index + idx;
|
||||
arg.dst = dst_data + start_index + idx;
|
||||
arg.src = arg.dst = dst_data + dst_data_size * (start_index + idx);
|
||||
arg.work_amount = static_cast<size_t>(work_amount);
|
||||
|
||||
(*logistic_kernel)(&arg);
|
||||
});
|
||||
} else {
|
||||
for (int i = 0; i < count; i++) {
|
||||
dst_data[i + start_index] = logistic_scalar(dst_data[i + start_index]);
|
||||
if (Precision::FP32 == output_prec) {
|
||||
auto float_dst_data = reinterpret_cast<float*>(dst_data);
|
||||
for (int i = 0; i < count; i++) {
|
||||
float_dst_data[i + start_index] = logistic_scalar(float_dst_data[i + start_index]);
|
||||
}
|
||||
} else if (Precision::BF16 == output_prec) {
|
||||
auto bf16_dst_data = reinterpret_cast<bfloat16_t*>(dst_data);
|
||||
for (int i = 0; i < count; i++) {
|
||||
bf16_dst_data[i + start_index] = logistic_scalar(bf16_dst_data[i + start_index]);
|
||||
}
|
||||
} else {
|
||||
THROW_IE_EXCEPTION << "Unsupported precision configuration outPrc=" << output_prec.name();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -18,7 +18,7 @@ public:
|
||||
|
||||
stride = layer->GetParamAsInt("stride");
|
||||
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN)});
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, Precision::FP32)});
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -23,10 +23,12 @@ public:
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!";
|
||||
|
||||
src_dims = layer->insData[REVERSESEQUENCE_DATA].lock()->getTensorDesc().getDims();
|
||||
|
||||
Precision lengthsPrecision = layer->insData[REVERSESEQUENCE_LENGTHS].lock()->getTensorDesc().getPrecision();
|
||||
if (lengthsPrecision != Precision::I32 && lengthsPrecision != Precision::FP32)
|
||||
lengthsPrecision = Precision::I32;
|
||||
|
||||
SizeVector seq_lengths_dims = layer->insData[REVERSESEQUENCE_LENGTHS].lock()->getTensorDesc().getDims();
|
||||
if (layer->insData[REVERSESEQUENCE_LENGTHS].lock()->getTensorDesc().getPrecision() != Precision::I32 &&
|
||||
layer->insData[REVERSESEQUENCE_LENGTHS].lock()->getTensorDesc().getPrecision() != Precision::FP32)
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect 'seq_lengths' input precision. Only FP32 and I32 are supported!";
|
||||
if (seq_lengths_dims.size() > 1)
|
||||
THROW_IE_EXCEPTION << layer->name << " Seq_lengths vector should be 1 dimension";
|
||||
|
||||
@ -60,7 +62,7 @@ public:
|
||||
work_amount_dst = srcStrides[0] * src_dims[0];
|
||||
|
||||
addConfig(layer,
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, lengthsPrecision) },
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::FP32) });
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
|
@ -328,8 +328,8 @@ public:
|
||||
pooled_height_ = output_dim_;
|
||||
pooled_width_ = output_dim_;
|
||||
|
||||
std::vector<DataConfigurator> inputs_layouts(layer->insData.size(), DataConfigurator(ConfLayout::PLN));
|
||||
std::vector<DataConfigurator> outputs_layouts(layer->outData.size(), DataConfigurator(ConfLayout::PLN));
|
||||
std::vector<DataConfigurator> inputs_layouts(layer->insData.size(), DataConfigurator(ConfLayout::PLN, Precision::FP32));
|
||||
std::vector<DataConfigurator> outputs_layouts(layer->outData.size(), DataConfigurator(ConfLayout::PLN, Precision::FP32));
|
||||
addConfig(layer, inputs_layouts, outputs_layouts);
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
|
@ -31,8 +31,12 @@ public:
|
||||
|
||||
broadcast = layer->GetParamAsString("auto_broadcast", "numpy");
|
||||
|
||||
if (layer->insData[THEN].lock()->getTensorDesc().getPrecision() != layer->insData[ELSE].lock()->getTensorDesc().getPrecision())
|
||||
THROW_IE_EXCEPTION << "Select layer with name '" << layer->name << "' has different precisions on 'Then' and 'Else' inputs";
|
||||
auto inputPrecision = layer->insData[THEN].lock()->getTensorDesc().getPrecision();
|
||||
if (inputPrecision == Precision::BF16 || layer->insData[ELSE].lock()->getTensorDesc().getPrecision() == Precision::BF16) {
|
||||
inputPrecision = Precision::BF16;
|
||||
} else if (layer->insData[THEN].lock()->getTensorDesc().getPrecision() != layer->insData[ELSE].lock()->getTensorDesc().getPrecision()) {
|
||||
THROW_IE_EXCEPTION << "Select layer with name '" << layer->name << "' has different precisions on 'Then' and 'Else' inputs ";
|
||||
}
|
||||
|
||||
const auto& conditionPrecision = layer->insData[CONDITION].lock()->getTensorDesc().getPrecision();
|
||||
if (conditionPrecision != Precision::BOOL && conditionPrecision != Precision::I32 && conditionPrecision != Precision::U8)
|
||||
@ -100,7 +104,7 @@ public:
|
||||
inConfig.inPlace = -1;
|
||||
inConfig.constant = false;
|
||||
|
||||
Precision inPrecision = layer->insData[i].lock()->getTensorDesc().getPrecision();
|
||||
Precision inPrecision = i == CONDITION ? conditionPrecision : inputPrecision;
|
||||
const SizeVector& inDims = layer->insData[i].lock()->getTensorDesc().getDims();
|
||||
inConfig.desc = TensorDesc(inPrecision, inDims, InferenceEngine::TensorDesc::getLayoutByDims(inDims));
|
||||
|
||||
@ -110,9 +114,8 @@ public:
|
||||
DataConfig outConfig;
|
||||
outConfig.inPlace = -1;
|
||||
outConfig.constant = false;
|
||||
Precision outPrecision = layer->insData[1].lock()->getTensorDesc().getPrecision();
|
||||
const SizeVector& outDims = layer->outData[0]->getTensorDesc().getDims();
|
||||
outConfig.desc = TensorDesc(outPrecision, outDims, InferenceEngine::TensorDesc::getLayoutByDims(outDims));
|
||||
outConfig.desc = TensorDesc(inputPrecision, outDims, InferenceEngine::TensorDesc::getLayoutByDims(outDims));
|
||||
config.outConfs.push_back(outConfig);
|
||||
|
||||
config.dynBatchSupport = false;
|
||||
|
@ -225,8 +225,8 @@ public:
|
||||
layer->insData[0].lock()->getTensorDesc().getDims().size() != 4)
|
||||
THROW_IE_EXCEPTION << "Unsupported dimensions!";
|
||||
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)},
|
||||
{DataConfigurator(ConfLayout::PLN)});
|
||||
addConfig(layer, {DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, Precision::FP32)});
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -54,15 +54,11 @@ public:
|
||||
auto inData = spaceToBatchLayer->insData[i].lock();
|
||||
if (inData == nullptr)
|
||||
THROW_IE_EXCEPTION << "'" << spaceToBatchLayer->name << "' layer has nullable input data";
|
||||
config.inConfs[i].desc = TensorDesc(inData->getTensorDesc().getPrecision(),
|
||||
inData->getTensorDesc().getDims(),
|
||||
inData->getTensorDesc().getLayout());
|
||||
config.inConfs[i].desc = TensorDesc(precision, inData->getTensorDesc().getDims(), inData->getTensorDesc().getLayout());
|
||||
}
|
||||
|
||||
DataConfig outConfig;
|
||||
outConfig.desc = TensorDesc(layer->outData[0]->getTensorDesc().getPrecision(),
|
||||
out_dims,
|
||||
layer->outData[0]->getTensorDesc().getLayout());
|
||||
outConfig.desc = TensorDesc(precision, out_dims, layer->outData[0]->getTensorDesc().getLayout());
|
||||
config.outConfs.push_back(outConfig);
|
||||
config.dynBatchSupport = false;
|
||||
confs.push_back(config);
|
||||
|
@ -25,11 +25,6 @@ public:
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!";
|
||||
}
|
||||
|
||||
Precision input_indices_precision = layer->insData[INPUT_INDICES_PORT].lock()->getTensorDesc().getPrecision();
|
||||
if (input_indices_precision != Precision::FP32) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect input precision. Only FP32 is supported!";
|
||||
}
|
||||
|
||||
// check dimensions of input tensors
|
||||
SizeVector input_indices_dims = layer->insData[INPUT_INDICES_PORT].lock()->getTensorDesc().getDims();
|
||||
if (input_indices_dims.size() != 2 || input_indices_dims[1] != 2) {
|
||||
@ -75,8 +70,10 @@ public:
|
||||
|
||||
// TODO: check that dense shape value is set
|
||||
addConfig(layer,
|
||||
{DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)},
|
||||
{DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)});
|
||||
{DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32)},
|
||||
{DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::FP32)});
|
||||
}
|
||||
catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
|
@ -38,20 +38,6 @@ public:
|
||||
else
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect SparseSegmentReduce layer type!";
|
||||
|
||||
// check a precision of input tensors
|
||||
Precision input_data_precision = layer->insData[INPUT_DATA_PORT].lock()->getTensorDesc().getPrecision();
|
||||
if (input_data_precision != Precision::FP32) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect precision of the input data. Only FP32 is supported!";
|
||||
}
|
||||
Precision input_indices_precision = layer->insData[INPUT_INDICES_PORT].lock()->getTensorDesc().getPrecision();
|
||||
if (input_indices_precision != Precision::FP32) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect precision of the input indices. Only FP32 is supported!";
|
||||
}
|
||||
Precision input_segment_ids_precision = layer->insData[INPUT_SEGMENT_IDS_PORT].lock()->getTensorDesc().getPrecision();
|
||||
if (input_segment_ids_precision != Precision::FP32) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect precision of segment IDs. Only FP32 is supported!";
|
||||
}
|
||||
|
||||
// check shapes of the second and third input tensors
|
||||
input_indices_dims = layer->insData[INPUT_INDICES_PORT].lock()->getTensorDesc().getDims();
|
||||
if (input_indices_dims.size() != 1) {
|
||||
@ -65,12 +51,6 @@ public:
|
||||
THROW_IE_EXCEPTION << layer->name << " Shapes for input indices and segment IDs must match.";
|
||||
}
|
||||
|
||||
// check a precision of output tensor
|
||||
Precision output_precision = layer->insData[OUTPUT_PORT].lock()->getTensorDesc().getPrecision();
|
||||
if (output_precision != Precision::FP32) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect precision of output data. Only FP32 is supported!";
|
||||
}
|
||||
|
||||
// check shapes of output tensor
|
||||
input_data_dims = layer->insData[INPUT_DATA_PORT].lock()->getTensorDesc().getDims();
|
||||
output_dims = layer->outData[OUTPUT_PORT]->getTensorDesc().getDims();
|
||||
@ -88,8 +68,8 @@ public:
|
||||
|
||||
// confugure layouts of input and output ports
|
||||
addConfig(layer,
|
||||
{ DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN) });
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::FP32) }, { DataConfigurator(ConfLayout::PLN, Precision::FP32) });
|
||||
}
|
||||
catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
|
@ -28,26 +28,6 @@ public:
|
||||
with_default_value = true;
|
||||
}
|
||||
|
||||
// check precisions for input tensors
|
||||
Precision input_indices_precision = layer->insData[INPUT_INDICES_PORT].lock()->getTensorDesc().getPrecision();
|
||||
if (input_indices_precision != Precision::I32) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect input precision for input indices. Only I32 is supported!";
|
||||
}
|
||||
Precision input_dense_shape_precision = layer->insData[INPUT_DENSE_SHAPE_PORT].lock()->getTensorDesc().getPrecision();
|
||||
if (input_dense_shape_precision != Precision::I32) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect input precision for input dense shape. Only I32 is supported!";
|
||||
}
|
||||
Precision input_values_precision = layer->insData[INPUT_VALUES_PORT].lock()->getTensorDesc().getPrecision();
|
||||
if (input_values_precision != Precision::I32) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect input precision for input values. Only I32 is supported!";
|
||||
}
|
||||
if (with_default_value) {
|
||||
Precision input_default_value_precision = layer->insData[INPUT_DEFAULT_VALUE_PORT].lock()->getTensorDesc().getPrecision();
|
||||
if (input_default_value_precision != Precision::I32) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect input precision for input default value. Only I32 is supported!";
|
||||
}
|
||||
}
|
||||
|
||||
// check dimensions of input tensors
|
||||
SizeVector input_dense_shape_dims = layer->insData[INPUT_DENSE_SHAPE_PORT].lock()->getTensorDesc().getDims();
|
||||
if (input_dense_shape_dims.size() != 1 || input_dense_shape_dims[0] < 1) {
|
||||
@ -73,14 +53,14 @@ public:
|
||||
// TODO: check that dense shape value is set
|
||||
if (with_default_value) {
|
||||
addConfig(layer,
|
||||
{ DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN),
|
||||
DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN) });
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::I32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::I32) },
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::I32) });
|
||||
} else {
|
||||
addConfig(layer,
|
||||
{ DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN),
|
||||
DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN) });
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::I32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::I32) },
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::I32) });
|
||||
}
|
||||
}
|
||||
catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
|
@ -127,14 +127,15 @@ public:
|
||||
// TODO: check that dense shape value is set
|
||||
if (with_weights) {
|
||||
addConfig(layer,
|
||||
{ DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN),
|
||||
DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN) });
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::I32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::I32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::I32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::FP32) }, { DataConfigurator(ConfLayout::PLN, Precision::FP32) });
|
||||
} else {
|
||||
addConfig(layer,
|
||||
{ DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN),
|
||||
DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN) });
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::I32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::FP32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::I32) }, { DataConfigurator(ConfLayout::PLN, Precision::FP32) });
|
||||
}
|
||||
}
|
||||
catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
|
@ -35,8 +35,6 @@ public:
|
||||
begin_dims = {};
|
||||
if (layer->insData.size() > 1) {
|
||||
begin_dims = layer->insData[STRIDEDSLICE_BEGIN].lock()->getTensorDesc().getDims();
|
||||
if (layer->insData[STRIDEDSLICE_BEGIN].lock()->getTensorDesc().getPrecision() != Precision::I32)
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect 'begin' input precision. Only I32 is supported!";
|
||||
if (begin_dims.size() > 1)
|
||||
THROW_IE_EXCEPTION << layer->name << " Begin vector should be 1 dimension";
|
||||
bounds_size = begin_dims[0];
|
||||
@ -44,8 +42,6 @@ public:
|
||||
|
||||
if (layer->insData.size() > 2) {
|
||||
end_dims = layer->insData[STRIDEDSLICE_END].lock()->getTensorDesc().getDims();
|
||||
if (layer->insData[STRIDEDSLICE_END].lock()->getTensorDesc().getPrecision() != Precision::I32)
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect 'end' input precision. Only I32 is supported!";
|
||||
if (end_dims.size() > 1)
|
||||
THROW_IE_EXCEPTION << layer->name << " End vector should be 1 dimension";
|
||||
if (begin_dims[0] != end_dims[0])
|
||||
@ -54,8 +50,6 @@ public:
|
||||
|
||||
if (layer->insData.size() > 3) {
|
||||
stride_dims = layer->insData[STRIDEDSLICE_STRIDE].lock()->getTensorDesc().getDims();
|
||||
if (layer->insData[STRIDEDSLICE_STRIDE].lock()->getTensorDesc().getPrecision() != Precision::I32)
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect 'strides' input precision. Only I32 is supported!";
|
||||
if (stride_dims.size() > 1)
|
||||
THROW_IE_EXCEPTION << layer->name << " End vector should be 1 dimension";
|
||||
if (begin_dims[0] != stride_dims[0])
|
||||
@ -134,16 +128,19 @@ public:
|
||||
|
||||
srcStrides = layer->insData[STRIDEDSLICE_DATA].lock()->getTensorDesc().getBlockingDesc().getStrides();
|
||||
dstStrides = layer->outData[0]->getTensorDesc().getBlockingDesc().getStrides();
|
||||
Precision dataPrecision = layer->insData[STRIDEDSLICE_DATA].lock()->getTensorDesc().getPrecision();
|
||||
if (layer->insData.size() == 1) {
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) });
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN, dataPrecision) }, { DataConfigurator(ConfLayout::PLN, dataPrecision) });
|
||||
} else if (layer->insData.size() == 2) {
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) });
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN, dataPrecision), DataConfigurator(ConfLayout::PLN, Precision::I32) },
|
||||
{ DataConfigurator(ConfLayout::PLN, dataPrecision) });
|
||||
} else if (layer->insData.size() == 3) {
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN) });
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN, dataPrecision), DataConfigurator(ConfLayout::PLN, Precision::I32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::I32) }, { DataConfigurator(ConfLayout::PLN, dataPrecision) });
|
||||
} else {
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN),
|
||||
DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) });
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN, dataPrecision), DataConfigurator(ConfLayout::PLN, Precision::I32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::I32) },
|
||||
{ DataConfigurator(ConfLayout::PLN, dataPrecision) });
|
||||
}
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
@ -151,8 +148,6 @@ public:
|
||||
}
|
||||
|
||||
StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept override {
|
||||
const float *src_data = inputs[STRIDEDSLICE_DATA]->cbuffer().as<const float *>() +
|
||||
inputs[STRIDEDSLICE_DATA]->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
int *begin = nullptr, *end = nullptr, *stride = nullptr;
|
||||
if (begin_dims.size())
|
||||
begin = inputs[STRIDEDSLICE_BEGIN]->cbuffer().as<int *>() + inputs[STRIDEDSLICE_BEGIN]->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
@ -160,17 +155,12 @@ public:
|
||||
end = inputs[STRIDEDSLICE_END]->cbuffer().as<int *>() + inputs[STRIDEDSLICE_END]->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
if (stride_dims.size())
|
||||
stride = inputs[STRIDEDSLICE_STRIDE]->cbuffer().as<int *>() + inputs[STRIDEDSLICE_STRIDE]->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
float* dst_data = outputs[0]->cbuffer().as<float *>() +
|
||||
outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
|
||||
InferenceEngine::SizeVector src_dims = inputs[STRIDEDSLICE_DATA]->getTensorDesc().getDims();
|
||||
InferenceEngine::SizeVector srcStrides = inputs[STRIDEDSLICE_DATA]->getTensorDesc().getBlockingDesc().getStrides();
|
||||
InferenceEngine::SizeVector dst_dims = outputs[0]->getTensorDesc().getDims();
|
||||
InferenceEngine::SizeVector dstStrides = outputs[0]->getTensorDesc().getBlockingDesc().getStrides();
|
||||
|
||||
auto dst_size = outputs[0]->byteSize();
|
||||
memset(dst_data, 0, dst_size);
|
||||
|
||||
size_t i, j, k, bj, ej, sj;
|
||||
InferenceEngine::SizeVector our_dims;
|
||||
InferenceEngine::SizeVector out_dims;
|
||||
@ -231,13 +221,49 @@ public:
|
||||
return PARAMETER_MISMATCH;
|
||||
}
|
||||
|
||||
const size_t inputsPrecSize = inputs[STRIDEDSLICE_DATA]->getTensorDesc().getPrecision().size();
|
||||
if (static_cast<int>(src_dims.size()) == max_dims && shrink_axis == 0 &&
|
||||
stride_dms[stride_dms.size()-1] == 1 && stride_dms.size() > 1)
|
||||
strided_slice_vp(src_data, dst_data);
|
||||
else if (static_cast<int>(src_dims.size()) == max_dims && shrink_axis == 0)
|
||||
strided_slice_p(src_data, dst_data);
|
||||
else
|
||||
strided_slice(src_data, dst_data, our_dims);
|
||||
stride_dms[stride_dms.size()-1] == 1 && stride_dms.size() > 1) {
|
||||
if (inputsPrecSize != outputs[0]->getTensorDesc().getPrecision().size()) {
|
||||
if (resp) {
|
||||
std::string errorMsg = "StridedSlice layer doesn't support 'Data' input precision: "
|
||||
+ std::string(inputs[STRIDEDSLICE_DATA]->getTensorDesc().getPrecision().name());
|
||||
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
|
||||
}
|
||||
return GENERAL_ERROR;
|
||||
}
|
||||
strided_slice_vp(inputs[STRIDEDSLICE_DATA], outputs[0]);
|
||||
} else if (static_cast<int>(src_dims.size()) == max_dims && shrink_axis == 0) {
|
||||
switch (inputsPrecSize) {
|
||||
case 1: { strided_slice_p<uint8_t>(inputs[STRIDEDSLICE_DATA], outputs[0]); break; }
|
||||
case 2: { strided_slice_p<uint16_t>(inputs[STRIDEDSLICE_DATA], outputs[0]); break; }
|
||||
case 4: { strided_slice_p<uint32_t>(inputs[STRIDEDSLICE_DATA], outputs[0]); break; }
|
||||
case 8: { strided_slice_p<uint64_t>(inputs[STRIDEDSLICE_DATA], outputs[0]); break; }
|
||||
default: {
|
||||
if (resp) {
|
||||
std::string errorMsg = "StridedSlice layer doesn't support 'Data' input precision: "
|
||||
+ std::string(inputs[STRIDEDSLICE_DATA]->getTensorDesc().getPrecision().name());
|
||||
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
|
||||
}
|
||||
return GENERAL_ERROR;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
switch (inputsPrecSize) {
|
||||
case 1: { strided_slice<uint8_t>(inputs[STRIDEDSLICE_DATA], outputs[0], our_dims); break; }
|
||||
case 2: { strided_slice<uint16_t>(inputs[STRIDEDSLICE_DATA], outputs[0], our_dims); break; }
|
||||
case 4: { strided_slice<uint32_t>(inputs[STRIDEDSLICE_DATA], outputs[0], our_dims); break; }
|
||||
case 8: { strided_slice<uint64_t>(inputs[STRIDEDSLICE_DATA], outputs[0], our_dims); break; }
|
||||
default: {
|
||||
if (resp) {
|
||||
std::string errorMsg = "StridedSlice layer doesn't support 'Data' input precision: "
|
||||
+ std::string(inputs[STRIDEDSLICE_DATA]->getTensorDesc().getPrecision().name());
|
||||
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
|
||||
}
|
||||
return GENERAL_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return OK;
|
||||
}
|
||||
@ -248,9 +274,11 @@ private:
|
||||
const size_t STRIDEDSLICE_END = 2;
|
||||
const size_t STRIDEDSLICE_STRIDE = 3;
|
||||
|
||||
void strided_slice(const float *src_data, float* dst_data, std::vector<size_t> &dims);
|
||||
void strided_slice_vp(const float *src_data, float* dst_data);
|
||||
void strided_slice_p(const float *src_data, float* dst_data);
|
||||
template <typename T>
|
||||
void strided_slice(Blob::Ptr&, Blob::Ptr& dst_data, std::vector<size_t> &dims);
|
||||
void strided_slice_vp(Blob::Ptr&, Blob::Ptr& dst_data);
|
||||
template <typename T>
|
||||
void strided_slice_p(Blob::Ptr&, Blob::Ptr& dst_data);
|
||||
|
||||
SizeVector begin_dims;
|
||||
SizeVector end_dims;
|
||||
@ -275,7 +303,13 @@ private:
|
||||
int ellipsis_pos1, ellipsis_pos2;
|
||||
};
|
||||
|
||||
void StridedSliceImpl::strided_slice(const float *src_data, float* dst_data, std::vector<size_t> &dims) {
|
||||
template <typename T>
|
||||
void StridedSliceImpl::strided_slice(Blob::Ptr& input, Blob::Ptr& output, std::vector<size_t> &dims) {
|
||||
auto* src_data = input->cbuffer().as<const T*>() + input->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto* dst_data = output->buffer().as<T*>() + output->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto dst_size = output->byteSize();
|
||||
memset(dst_data, 0, dst_size);
|
||||
|
||||
size_t work_amount_dst = dstStrides[0] * dst_dims[0];
|
||||
parallel_nt(0, [&](const int ithr, const int nthr) {
|
||||
int j;
|
||||
@ -306,10 +340,16 @@ void StridedSliceImpl::strided_slice(const float *src_data, float* dst_data, std
|
||||
});
|
||||
}
|
||||
|
||||
void StridedSliceImpl::strided_slice_vp(const float *src_data, float* dst_data) {
|
||||
void StridedSliceImpl::strided_slice_vp(Blob::Ptr& input, Blob::Ptr& output) {
|
||||
size_t dataSize = input->getTensorDesc().getPrecision().size();
|
||||
const uint8_t* src_data = input->cbuffer().as<const uint8_t*>() + input->getTensorDesc().getBlockingDesc().getOffsetPadding() * dataSize;
|
||||
uint8_t* dst_data = output->buffer().as<uint8_t*>() + output->getTensorDesc().getBlockingDesc().getOffsetPadding() * dataSize;
|
||||
auto dst_size = output->byteSize();
|
||||
memset(dst_data, 0, dst_size);
|
||||
|
||||
// Vectorized copy
|
||||
size_t dims_size_1 = dst_dims.size() - 1;
|
||||
size_t dataLength = dst_dims[dims_size_1];
|
||||
size_t len = dst_dims[dims_size_1] * dataSize;
|
||||
size_t work_amount_dst = dstStrides[0] * dst_dims[0] / dst_dims[dims_size_1];
|
||||
|
||||
parallel_nt(0, [&](const int ithr, const int nthr) {
|
||||
@ -323,8 +363,8 @@ void StridedSliceImpl::strided_slice_vp(const float *src_data, float* dst_data)
|
||||
i /= dst_dims[j];
|
||||
}
|
||||
|
||||
for (size_t iwork = start, dst_idx = start * dataLength, i = 1; iwork < end; ++iwork, dst_idx += dataLength) {
|
||||
cpu_memcpy(&dst_data[dst_idx], &src_data[src_idx], sizeof(float) * dataLength);
|
||||
for (size_t iwork = start, dst_idx = start * len, i = 1; iwork < end; ++iwork, dst_idx += len) {
|
||||
cpu_memcpy(&dst_data[dst_idx], &src_data[src_idx * dataSize], len);
|
||||
for (int j = dims_size_1 - 1; j >= 0; j--) {
|
||||
counters[j]++;
|
||||
if (counters[j] < dst_dims[j]) {
|
||||
@ -342,7 +382,13 @@ void StridedSliceImpl::strided_slice_vp(const float *src_data, float* dst_data)
|
||||
});
|
||||
}
|
||||
|
||||
void StridedSliceImpl::strided_slice_p(const float *src_data, float* dst_data) {
|
||||
template <typename T>
|
||||
void StridedSliceImpl::strided_slice_p(Blob::Ptr& input, Blob::Ptr& output) {
|
||||
auto* src_data = input->cbuffer().as<const T*>() + input->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto* dst_data = output->buffer().as<T*>() + output->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto dst_size = output->byteSize();
|
||||
memset(dst_data, 0, dst_size);
|
||||
|
||||
size_t dims_size = dst_dims.size();
|
||||
size_t work_amount_dst = dstStrides[0] * dst_dims[0];
|
||||
|
||||
|
@ -30,14 +30,6 @@ public:
|
||||
if (layer->outData.size() != 1 && layer->outData.size() != 2)
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect number of output edges!";
|
||||
|
||||
// DataConfigurator::addConfig will automatically change BF16 datatype to FP32
|
||||
// it can be changed back by explicit modification like confs.back().outConfs[i].desc.setPrecision(Precision::BF16);
|
||||
// if current layer supports BF16 naturally. usually they are not and nothing special is not required
|
||||
if ((layer->insData[TOPK_DATA].lock()->getTensorDesc().getPrecision() != Precision::FP32 &&
|
||||
layer->insData[TOPK_DATA].lock()->getTensorDesc().getPrecision() != Precision::BF16) ||
|
||||
layer->insData[TOPK_K].lock()->getTensorDesc().getPrecision() != Precision::I32)
|
||||
THROW_IE_EXCEPTION << layer->name << " TopKImpl - Incorrect input data/index values precision.";
|
||||
|
||||
if (layer->insData[TOPK_K].lock()->getTensorDesc().getDims().size() > 1)
|
||||
THROW_IE_EXCEPTION << layer->name << " TopKImpl - Index vector should be 1 dimension";
|
||||
|
||||
@ -47,10 +39,6 @@ public:
|
||||
THROW_IE_EXCEPTION << layer->name << " TopKImpl - Incorrect input/output tensor dimension sizes";
|
||||
|
||||
if (layer->outData.size() == 2) {
|
||||
if (layer->outData[TOPK_VALUE]->getTensorDesc().getPrecision() != Precision::FP32 &&
|
||||
layer->outData[TOPK_VALUE]->getTensorDesc().getPrecision() != Precision::BF16)
|
||||
THROW_IE_EXCEPTION << layer->name << " TopKImpl - Incorrect output data tensor precision. Floating point datatypes are supported!";
|
||||
|
||||
SizeVector dst_idx_dims = layer->outData[TOPK_INDEX]->getTensorDesc().getDims();
|
||||
if (dst_dims.size() != dst_idx_dims.size())
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect output tensor dimension sizes";
|
||||
@ -102,11 +90,11 @@ public:
|
||||
before_num = count(src_dims, 0, axis);
|
||||
|
||||
if (layer->outData.size() == 1) {
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) },
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::I32) },
|
||||
{ DataConfigurator(ConfLayout::PLN) });
|
||||
} else {
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) });
|
||||
addConfig(layer, { DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::I32) },
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN) });
|
||||
|
||||
// TODO: WA... While ICNNNetwork has no clear rule to fill tensor precision
|
||||
// it use precision of parent layer. So each output tensor Data object has
|
||||
|
@ -39,8 +39,8 @@ public:
|
||||
max_rois_num_ = layer->GetParamAsInt("max_rois", 0);
|
||||
|
||||
addConfig(layer,
|
||||
{DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)},
|
||||
{DataConfigurator(ConfLayout::PLN)});
|
||||
{DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32)},
|
||||
{DataConfigurator(ConfLayout::PLN, Precision::FP32)});
|
||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
errorMsg = ex.what();
|
||||
}
|
||||
|
@ -61,20 +61,12 @@ public:
|
||||
// check dimensions of output tensors and its precisions
|
||||
size_t cur_output_port = 0;
|
||||
SizeVector output_uniques_dims = layer->outData[cur_output_port]->getTensorDesc().getDims();
|
||||
Precision output_uniques_precision = layer->outData[cur_output_port]->getTensorDesc().getPrecision();
|
||||
if (output_uniques_precision != Precision::FP32) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect precision for output tensor of unique elements. Only FP32 is supported!";
|
||||
}
|
||||
if (output_uniques_dims.size() != 1 || output_uniques_dims[0] != num_elements) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect dimensions for output tensor of unique elements.";
|
||||
}
|
||||
if (return_inverse) {
|
||||
cur_output_port++;
|
||||
SizeVector output_indices_dims = layer->outData[cur_output_port]->getTensorDesc().getDims();
|
||||
Precision output_indices_precision = layer->outData[cur_output_port]->getTensorDesc().getPrecision();
|
||||
if (output_indices_precision != Precision::FP32) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect precision for output tensor of indices. Only FP32 is supported!";
|
||||
}
|
||||
if (output_indices_dims.size() != 1 || output_indices_dims[0] != num_elements) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect dimensions for output tensor of indices.";
|
||||
}
|
||||
@ -82,10 +74,6 @@ public:
|
||||
if (return_counts) {
|
||||
cur_output_port++;
|
||||
SizeVector output_counts_dims = layer->outData[cur_output_port]->getTensorDesc().getDims();
|
||||
Precision output_counts_precision = layer->outData[cur_output_port]->getTensorDesc().getPrecision();
|
||||
if (output_counts_precision != Precision::FP32) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect precision for output tensor of counts. Only FP32 is supported!";
|
||||
}
|
||||
if (output_counts_dims.size() != 1 || output_counts_dims[0] != num_elements) {
|
||||
THROW_IE_EXCEPTION << layer->name << " Incorrect dimensions for output tensor of counts.";
|
||||
}
|
||||
@ -94,16 +82,16 @@ public:
|
||||
// add a layer configuration
|
||||
if (layer->outData.size() == 1) {
|
||||
addConfig(layer,
|
||||
{ DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN) });
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::FP32) },
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::FP32) });
|
||||
} else if (layer->outData.size() == 2) {
|
||||
addConfig(layer,
|
||||
{ DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) });
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::FP32) },
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32) });
|
||||
} else if (layer->outData.size() == 3) {
|
||||
addConfig(layer,
|
||||
{ DataConfigurator(ConfLayout::PLN) },
|
||||
{ DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) });
|
||||
{ DataConfigurator(ConfLayout::PLN, Precision::FP32) }, { DataConfigurator(ConfLayout::PLN, Precision::FP32),
|
||||
DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32) });
|
||||
}
|
||||
}
|
||||
catch (InferenceEngine::details::InferenceEngineException &ex) {
|
||||
|
141
inference-engine/src/mkldnn_plugin/utils/bfloat16.hpp
Normal file
141
inference-engine/src/mkldnn_plugin/utils/bfloat16.hpp
Normal file
@ -0,0 +1,141 @@
|
||||
// Copyright (C) 2020 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cmath>
#include <cstdint>
#include <limits>
|
||||
|
||||
/**
|
||||
* The bfloat16_t class can be used as an arithmetic type. All arithmetic operations goes through conversion to the float data type.
|
||||
*/
|
||||
|
||||
|
||||
// Select the float -> bf16 conversion policy used by the converting constructor.
// Exactly one of BFLOAT16_ROUND_MODE_TO_NEAREST, BFLOAT16_ROUND_MODE_TO_NEAREST_EVEN
// or BFLOAT16_ROUND_MODE_TRUNCATE must be defined.
#define BFLOAT16_ROUND_MODE_TRUNCATE

namespace MKLDNNPlugin {
class bfloat16_t {
public:
    // Zero value.
    constexpr bfloat16_t()
        : m_value{0}
    {
    }
    // Implicit conversion from float is intentional: it lets bfloat16_t act as a
    // drop-in arithmetic type (all arithmetic goes through float).
    bfloat16_t(float value) noexcept
        : m_value{
#if defined BFLOAT16_ROUND_MODE_TO_NEAREST
            round_to_nearest(value)
#elif defined BFLOAT16_ROUND_MODE_TO_NEAREST_EVEN
            round_to_nearest_even(value)
#elif defined BFLOAT16_ROUND_MODE_TRUNCATE
            truncate(value)
#else
#error \
    "ROUNDING_MODE must be one of BFLOAT16_ROUND_MODE_TO_NEAREST, BFLOAT16_ROUND_MODE_TO_NEAREST_EVEN, or BFLOAT16_ROUND_MODE_TRUNCATE"
#endif
        }
    {
    }

    // bf16 is the upper half of an IEEE-754 binary32, so widening to float is exact.
    operator float() const {
        return F32{uint32_t(m_value) << 16}.vfloat;
    }
    // Reinterpret a raw 16-bit pattern as a bfloat16_t (no value conversion).
    static constexpr bfloat16_t from_bits(uint16_t bits) { return bfloat16_t(bits, true); }
    uint16_t to_bits() const { return m_value; }

    // Round-to-nearest-even: add 0.5 ulp only when the kept mantissa LSB is 1,
    // so halfway cases land on an even mantissa.
    static inline uint16_t round_to_nearest_even(float x) {
        return static_cast<uint16_t>((F32(x).vint + ((F32(x).vint & 0x00010000) >> 1)) >> 16);
    }

    // Round-to-nearest: add 0.5 ulp of the target format, then drop the low bits.
    static inline uint16_t round_to_nearest(float x) {
        return static_cast<uint16_t>((F32(x).vint + 0x8000) >> 16);
    }

    // Truncate: drop the low 16 mantissa bits (rounds toward zero).
    static inline uint16_t truncate(float x) { return static_cast<uint16_t>((F32(x).vint) >> 16); }

private:
    // Tag-dispatched raw-bits constructor backing from_bits().
    constexpr bfloat16_t(uint16_t x, bool)
        : m_value{x}
    {
    }
    // Union-based type punning between a float and its bit pattern.
    // NOTE(review): formally UB in ISO C++, but supported by all targeted
    // compilers; kept (including the 16-byte alignment) to match the original design.
    union alignas(16) F32 {
        F32(float val)
            : vfloat{val} {
        }

        F32(uint32_t val)
            : vint{val} {
        }
        float vfloat;
        uint32_t vint;
    };
    uint16_t m_value;
};
} // namespace MKLDNNPlugin

/**
 * std::numeric_limits overloaded for better compatibility with template metaprogramming.
 * For example, to make the following template work:
 * template <typename T>
 * void someFunction() {
 *     ...
 *     T maxValue = std::numeric_limits<T>::max();
 *     ...
 * }
 */

namespace std {
template <>
class numeric_limits<MKLDNNPlugin::bfloat16_t> {
public:
    static constexpr bool is_specialized = true;
    // Smallest positive *normal* value: exponent field = 1, mantissa = 0 (0x0080 == 2^-126).
    // Fixed from 0x007F, which is a subnormal bit pattern and contradicted
    // has_denorm == denorm_absent below.
    static constexpr MKLDNNPlugin::bfloat16_t min() noexcept {
        return MKLDNNPlugin::bfloat16_t::from_bits(0x0080);
    }
    // Largest finite value: exponent = 0xFE, mantissa = 0x7F (~3.39e38).
    static constexpr MKLDNNPlugin::bfloat16_t max() noexcept {
        return MKLDNNPlugin::bfloat16_t::from_bits(0x7F7F);
    }
    // Most negative finite value: -max().
    static constexpr MKLDNNPlugin::bfloat16_t lowest() noexcept {
        return MKLDNNPlugin::bfloat16_t::from_bits(0xFF7F);
    }
    // Significand precision: 7 stored mantissa bits + 1 implicit bit.
    // Fixed from 7; note epsilon() == 2^-7 == 2^(1 - 8) already implied digits == 8.
    static constexpr int digits = 8;
    static constexpr int digits10 = 2;
    static constexpr bool is_signed = true;
    static constexpr bool is_integer = false;
    static constexpr bool is_exact = false;
    static constexpr int radix = 2;
    // 2^-7: the gap between 1.0 and the next representable value.
    static constexpr MKLDNNPlugin::bfloat16_t epsilon() noexcept {
        return MKLDNNPlugin::bfloat16_t::from_bits(0x3C00);
    }
    // 0.5 ulp maximum rounding error.
    static constexpr MKLDNNPlugin::bfloat16_t round_error() noexcept {
        return MKLDNNPlugin::bfloat16_t::from_bits(0x3F00);
    }
    static constexpr int min_exponent = -125;
    static constexpr int min_exponent10 = -37;
    static constexpr int max_exponent = 128;
    static constexpr int max_exponent10 = 38;
    static constexpr bool has_infinity = true;
    static constexpr bool has_quiet_NaN = true;
    static constexpr bool has_signaling_NaN = true;
    static constexpr float_denorm_style has_denorm = denorm_absent;
    static constexpr bool has_denorm_loss = false;
    static constexpr MKLDNNPlugin::bfloat16_t infinity() noexcept {
        return MKLDNNPlugin::bfloat16_t::from_bits(0x7F80);
    }
    static constexpr MKLDNNPlugin::bfloat16_t quiet_NaN() noexcept {
        return MKLDNNPlugin::bfloat16_t::from_bits(0x7FC0);
    }
    static constexpr MKLDNNPlugin::bfloat16_t signaling_NaN() noexcept {
        return MKLDNNPlugin::bfloat16_t::from_bits(0x7FC0);
    }
    // For types without denormalization, denorm_min() equals min().
    // Fixed from from_bits(0), which is positive zero, not a minimum value.
    static constexpr MKLDNNPlugin::bfloat16_t denorm_min() noexcept {
        return MKLDNNPlugin::bfloat16_t::from_bits(0x0080);
    }
    static constexpr bool is_iec559 = false;
    // bf16 has finite min()/max(), hence it is a bounded type (was incorrectly false).
    static constexpr bool is_bounded = true;
    static constexpr bool is_modulo = false;
    static constexpr bool traps = false;
    static constexpr bool tinyness_before = false;
    // NOTE(review): the default build defines BFLOAT16_ROUND_MODE_TRUNCATE, so this
    // value is only accurate for the TO_NEAREST build configurations — confirm intent.
    static constexpr float_round_style round_style = round_to_nearest;
};
} // namespace std
|
@ -30,7 +30,7 @@ protected:
|
||||
std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
|
||||
// + Power1(FP32)
|
||||
// |
|
||||
// + AvgPooling1(FP32)
|
||||
// + AvgPooling1(BF16)
|
||||
// |
|
||||
// + Convolution1(BF16)
|
||||
// |
|
||||
@ -45,7 +45,7 @@ protected:
|
||||
// | /
|
||||
// ReLU3 (Fused to Conv2) /
|
||||
// | /
|
||||
// MaxPooling1 (FP32) /
|
||||
// MaxPooling1 (BF16) /
|
||||
// \ /
|
||||
// Eltwise
|
||||
// |
|
||||
@ -180,7 +180,7 @@ protected:
|
||||
// filling of expected precision of layer execution defined by precisoin of input tensor to the primitive and reflected in
|
||||
// performance counters
|
||||
expectedPrecisions["Power1"] = "FP32";
|
||||
expectedPrecisions["AvgPooling1"] = "FP32";
|
||||
expectedPrecisions["AvgPooling1"] = "BF16";
|
||||
expectedPrecisions["Convolution1"] = "BF16";
|
||||
expectedPrecisions["ReLU1"] = "ndef";
|
||||
expectedPrecisions["Convolution2"] = "BF16";
|
||||
@ -189,7 +189,7 @@ protected:
|
||||
expectedPrecisions["Norm1"] = "FP32";
|
||||
expectedPrecisions["Eltwise1"] = "ndef";
|
||||
expectedPrecisions["ReLU3"] = "ndef";
|
||||
expectedPrecisions["maxPooling1"] = "FP32";
|
||||
expectedPrecisions["maxPooling1"] = "BF16";
|
||||
expectedPrecisions["Eltwise2"] = "FP32";
|
||||
}
|
||||
};
|
||||
|
@ -131,7 +131,7 @@ protected:
|
||||
expectedPrecisions["ADD_1"] = "FP32";
|
||||
expectedPrecisions["CONV_1"] = "BF16";
|
||||
expectedPrecisions["CONV_2"] = "BF16";
|
||||
expectedPrecisions["CONC_1_TEST"] = "FP32";
|
||||
expectedPrecisions["CONC_1_TEST"] = "BF16";
|
||||
expectedPrecisions["RELU_1"] = "FP32";
|
||||
}
|
||||
};
|
||||
|
@ -32,7 +32,7 @@ protected:
|
||||
// |
|
||||
// ReLU1 (Fused)
|
||||
// |
|
||||
// Pooling1 (FP32)
|
||||
// Pooling1 (BF16)
|
||||
// |
|
||||
// Convolution2 (BF16)
|
||||
// |
|
||||
@ -164,7 +164,7 @@ protected:
|
||||
// performance counters
|
||||
expectedPrecisions["Convolution_1"] = "FP32";
|
||||
expectedPrecisions["ReLU_1"] = "ndef";
|
||||
expectedPrecisions["AvgPool_1"] = "FP32";
|
||||
expectedPrecisions["AvgPool_1"] = "BF16";
|
||||
expectedPrecisions["Convolution_2"] = "BF16";
|
||||
expectedPrecisions["ReLU_2"] = "ndef";
|
||||
expectedPrecisions["MaxPool_2"] = "BF16";
|
||||
|
@ -37,7 +37,7 @@ protected:
|
||||
// \ / /
|
||||
// Mul(FP32) ReLU(FP32)
|
||||
// \ /
|
||||
// Concat(FP32) Const
|
||||
// Concat(BF16) Const
|
||||
// \ /
|
||||
// Matmul(BF16)
|
||||
|
||||
@ -116,7 +116,7 @@ protected:
|
||||
fnPtr = createGraph(netPrecision);
|
||||
|
||||
// STAGE2: set up safe threshold <= 5% from maximum value of output tensor
|
||||
threshold = 170.02f; // Max in fp32 network by output: 3887.11
|
||||
threshold = 177.f; // Max in fp32 network by output: 3887.11
|
||||
|
||||
// STAGE3:
|
||||
// filling of expected precision of layer execution defined by precisoin of input tensor to the primitive and reflected in
|
||||
@ -125,7 +125,7 @@ protected:
|
||||
expectedPrecisions["Mul_1"] = "FP32";
|
||||
expectedPrecisions["Add_1"] = "FP32";
|
||||
expectedPrecisions["Relu_1"] = "FP32";
|
||||
expectedPrecisions["Conc_1"] = "FP32";
|
||||
expectedPrecisions["Conc_1"] = "BF16";
|
||||
expectedPrecisions["Matmul_1"] = "BF16";
|
||||
}
|
||||
};
|
||||
|
@ -24,7 +24,7 @@ protected:
|
||||
// |
|
||||
// Conv1 (FP32)
|
||||
// | \
|
||||
// Conv2 (FP32 so far while we have not greedy mode. This must be fixed. Such pattern shouild have Conv2 in BF16)
|
||||
// Conv2 (BF16) \
|
||||
// | |
|
||||
// relu(fused) |
|
||||
// | Normalize (not LRN)
|
||||
@ -145,18 +145,18 @@ protected:
|
||||
fnPtr = createGraph(netPrecision);
|
||||
|
||||
// STAGE1:
|
||||
threshold = 0.8f; // max value in latest tensor is 87.67
|
||||
threshold = 0.85f; // max value in latest tensor is 87.67
|
||||
// STAGE2:
|
||||
// filling of expected precision of layer execution defined by precisoin of input tensor to the primitive and reflected in
|
||||
// performance counters
|
||||
expectedPrecisions["ADD_1"] = "FP32";
|
||||
expectedPrecisions["CONV_1"] = "BF16";
|
||||
expectedPrecisions["CONV_2"] = "FP32";
|
||||
expectedPrecisions["CONV_2"] = "BF16";
|
||||
expectedPrecisions["RELU_2"] = "ndef";
|
||||
expectedPrecisions["DW_CONV"] = "BF16";
|
||||
expectedPrecisions["RELU_DW"] = "ndef";
|
||||
expectedPrecisions["NORM_1"] = "FP32";
|
||||
expectedPrecisions["CONC_1"] = "FP32";
|
||||
expectedPrecisions["CONC_1"] = "BF16";
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -93,7 +93,7 @@ protected:
|
||||
fnPtr = createGraph(netPrecision);
|
||||
|
||||
// STAGE1:
|
||||
threshold = 5e-2;
|
||||
threshold = 7e-2;
|
||||
// STAGE2:
|
||||
// filling of expected precision of layer execution defined by precisoin of input tensor to the primitive and reflected in
|
||||
// performance counters
|
||||
|
@ -117,7 +117,7 @@ protected:
|
||||
expectedPrecisions["ADD_1"] = "FP32";
|
||||
expectedPrecisions["CONV_1"] = "BF16";
|
||||
expectedPrecisions["CONV_2"] = "BF16";
|
||||
expectedPrecisions["CONC_1"] = "FP32";
|
||||
expectedPrecisions["CONC_1"] = "BF16";
|
||||
expectedPrecisions["RELU_1"] = "FP32";
|
||||
}
|
||||
};
|
||||
|
@ -142,7 +142,7 @@ protected:
|
||||
fnPtr = createGraph(netPrecision);
|
||||
|
||||
// STAGE1:
|
||||
threshold = 2e-1;
|
||||
threshold = 5e-1;
|
||||
|
||||
// STAGE2:
|
||||
// filling of expected precision of layer execution defined by precisoin of input tensor to the primitive and reflected in
|
||||
|
@ -5,4 +5,9 @@
|
||||
#include "functional_test_utils/plugin_config.hpp"
|
||||
|
||||
// Runs the test network "as is": implicit bf16 optimisations are switched off
// unless the test has already set KEY_ENFORCE_BF16 in its own configuration.
void PreparePluginConfiguration(LayerTestsUtils::LayerTestsCommon* test) {
    auto& configuration = test->GetConfiguration();
    const auto bf16Key = InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16;
    const bool alreadySet = configuration.count(bf16Key) != 0;
    if (!alreadySet) {
        configuration.insert({bf16Key, InferenceEngine::PluginConfigParams::NO});
    }
}
|
||||
|
@ -5,10 +5,11 @@
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include <ie_system_conf.h>
|
||||
#include "functional_test_utils/skip_tests_config.hpp"
|
||||
|
||||
std::vector<std::string> disabledTestPatterns() {
|
||||
return {
|
||||
std::vector<std::string> retVector{
|
||||
// TODO: Issue 26264
|
||||
R"(.*(MaxPool|AvgPool).*S\(1\.2\).*Rounding=ceil.*)",
|
||||
// TODO: Issue 31841
|
||||
@ -58,4 +59,12 @@ std::vector<std::string> disabledTestPatterns() {
|
||||
// TODO: Issue 43417 sporadic issue, looks like an issue in test, reproducible only on Windows platform
|
||||
R"(.*decomposition1_batch=5_hidden_size=10_input_size=30_.*tanh.relu.*_clip=0_linear_before_reset=1.*_targetDevice=CPU_.*)",
|
||||
};
|
||||
|
||||
if (!InferenceEngine::with_cpu_x86_bfloat16()) {
|
||||
// on platforms which do not support bfloat16, we are disabling bf16 tests since there are no bf16 primitives,
|
||||
// tests are useless on such platforms
|
||||
retVector.emplace_back(R"(.*BF16.*)");
|
||||
}
|
||||
|
||||
return retVector;
|
||||
}
|
||||
|
@ -0,0 +1,143 @@
|
||||
// Copyright (C) 2020 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <single_layer_tests/activation.hpp>
|
||||
#include "test_utils/cpu_test_utils.hpp"
|
||||
|
||||
using namespace InferenceEngine;
|
||||
using namespace CPUTestUtils;
|
||||
using namespace ngraph::helpers;
|
||||
|
||||
namespace CPULayerTestsDefinitions {
|
||||
|
||||
typedef std::tuple<
|
||||
LayerTestsDefinitions::activationParams,
|
||||
CPUSpecificParams>
|
||||
ActivationLayerCPUTestParamSet;
|
||||
|
||||
// CPU-specific Activation single-layer test: builds a one-node activation graph
// and lets CPUTestsBase verify which CPU implementation/precision executed it.
class ActivationLayerCPUTest : public testing::WithParamInterface<ActivationLayerCPUTestParamSet>,
                               virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
public:
    ActivationTypes activationType;
    // Test name = common activation test name + CPU-specific params suffix.
    static std::string getTestCaseName(const testing::TestParamInfo<ActivationLayerCPUTestParamSet> &obj) {
        LayerTestsDefinitions::activationParams basicParamsSet;
        CPUSpecificParams cpuParams;
        std::tie(basicParamsSet, cpuParams) = obj.param;

        std::ostringstream result;
        result << LayerTestsDefinitions::ActivationLayerTest::getTestCaseName(testing::TestParamInfo<LayerTestsDefinitions::activationParams>(
                basicParamsSet, 0));

        result << CPUTestsBase::getTestCaseName(cpuParams);

        return result.str();
    }
    // assumes createAndFillBlob(desc, range, start, resolution) fills [0, 15) at
    // 1/32768 resolution — small values keep bf16 rounding error low. TODO confirm.
    InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo &info) const override {
        return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), 15, 0, 32768);
    }

protected:
    void SetUp() override {
        LayerTestsDefinitions::activationParams basicParamsSet;
        CPUSpecificParams cpuParams;
        std::tie(basicParamsSet, cpuParams) = this->GetParam();

        // Unpack CPU-specific formats/priority; selectedType is overwritten below.
        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;

        InferenceEngine::Precision netPrecision;
        std::pair<std::vector<size_t>, std::vector<size_t>> shapes;
        std::pair<ActivationTypes, std::vector<float>> activationDecl;
        std::tie(activationDecl, netPrecision, inPrc, outPrc, inLayout, outLayout, shapes, targetDevice) = basicParamsSet;
        // Expected impl name is "<primitive>_<precision>"; presumably the eltwise
        // primitive executes in the input precision — verify against the plugin.
        selectedType = getPrimitiveType() + "_" + inPrc.name();

        activationType = activationDecl.first;
        auto constantsValue = activationDecl.second;
        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
        auto params = ngraph::builder::makeParams(ngPrc, {shapes.first});
        auto activation = ngraph::builder::makeActivation(params[0], ngPrc, activationType, shapes.second, constantsValue);
        // Attach the CPU-specific hints (formats, impl priority) to the node.
        activation->get_rt_info() = getCPUInfo();
        function = std::make_shared<ngraph::Function>(ngraph::NodeVector{activation}, params, "Activation");
    }
};
|
||||
|
||||
TEST_P(ActivationLayerCPUTest, CompareWithRefs) {
    SKIP_IF_CURRENT_TEST_IS_DISABLED()

    Run();
    // Activations are executed by the Eltwise node on CPU, so that is the node
    // whose implementation and precision are validated.
    CheckCPUImpl(executableNetwork, "Eltwise");
}
|
||||
|
||||
|
||||
namespace {
|
||||
// list only types supported by eltwise
|
||||
const std::map<ActivationTypes, std::vector<std::vector<float>>> activationTypes = {
|
||||
{Sqrt, {{}}},
|
||||
{Sigmoid, {{}}},
|
||||
{Tanh, {{}}},
|
||||
{Relu, {{}}},
|
||||
{Gelu, {{}}},
|
||||
{Exp, {{}}},
|
||||
{Clamp, {{-2.0f, 2.0f}}},
|
||||
{Elu, {{0.1f}}},
|
||||
{Swish, {{0.1f}}},
|
||||
{HSwish, {{}}},
|
||||
{Mish, {{}}},
|
||||
{PReLu, {{-0.01f}}}
|
||||
};
|
||||
|
||||
std::vector<CPUSpecificParams> cpuParams_4D = {
|
||||
CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}),
|
||||
CPUSpecificParams({nhwc}, {nhwc}, {}, {}),
|
||||
CPUSpecificParams({nchw}, {nchw}, {}, {})
|
||||
};
|
||||
|
||||
std::map<std::vector<size_t>, std::vector<std::vector<size_t>>> basic4D = {
|
||||
{{2, 4, 4, 1}, {{}}},
|
||||
{{2, 17, 5, 4}, {{}}},
|
||||
};
|
||||
|
||||
std::vector<Precision> bf16InpOutPrc = {Precision::BF16, Precision::FP32};
|
||||
|
||||
const auto basicCases4D = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes)),
|
||||
::testing::Values(Precision::BF16),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::ValuesIn(CommonTestUtils::combineParams(basic4D)),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D))
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_Activation4D_Eltwise_CPU_BF16, ActivationLayerCPUTest, basicCases4D, ActivationLayerCPUTest::getTestCaseName);
|
||||
|
||||
std::vector<CPUSpecificParams> cpuParams_5D = {
|
||||
CPUSpecificParams({nCdhw16c}, {nCdhw16c}, {}, {}),
|
||||
CPUSpecificParams({ndhwc}, {ndhwc}, {}, {}),
|
||||
CPUSpecificParams({ncdhw}, {ncdhw}, {}, {})
|
||||
};
|
||||
|
||||
std::map<std::vector<size_t>, std::vector<std::vector<size_t>>> basic5D = {
|
||||
{{2, 4, 3, 4, 1}, {{}}},
|
||||
{{2, 17, 7, 5, 4}, {{}}},
|
||||
};
|
||||
|
||||
const auto basicCases5D = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes)),
|
||||
::testing::Values(Precision::BF16),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::ValuesIn(CommonTestUtils::combineParams(basic5D)),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D))
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_Activation5D_Eltwise_CPU_BF16, ActivationLayerCPUTest, basicCases5D, ActivationLayerCPUTest::getTestCaseName);
|
||||
} // namespace
|
||||
} // namespace CPULayerTestsDefinitions
|
@ -0,0 +1,57 @@
|
||||
// Copyright (C) 2020 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <single_layer_tests/convert.hpp>
|
||||
|
||||
using namespace LayerTestsDefinitions;
|
||||
using namespace InferenceEngine;
|
||||
|
||||
namespace CPULayerTestsDefinitions {
|
||||
|
||||
// CPU Convert test reuses the shared ConvertLayerTest as-is; no CPU-specific
// parameters are required.
class ConvertCPULayerTest : public ConvertLayerTest {};

TEST_P(ConvertCPULayerTest, CompareWithRefs) {
    SKIP_IF_CURRENT_TEST_IS_DISABLED()

    // Take in/out precisions straight from the test parameters so the comparison
    // is done against the exact precisions the Convert node is asked to produce.
    ConvertParamsTuple params = GetParam();
    inPrc = std::get<1>(params);
    outPrc = std::get<2>(params);

    Run();
}
|
||||
|
||||
namespace {
|
||||
const std::vector<std::vector<size_t>> inShape = {{1, 2, 3, 4}};
|
||||
|
||||
// List of precisions natively supported by mkldnn.
|
||||
const std::vector<Precision> precisions = {
|
||||
Precision::U8,
|
||||
Precision::I8,
|
||||
Precision::I16,
|
||||
Precision::I32,
|
||||
Precision::FP32,
|
||||
Precision::BF16
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_ConvertLayerTest_From_BF16, ConvertCPULayerTest,
|
||||
::testing::Combine(
|
||||
::testing::Values(inShape),
|
||||
::testing::Values(Precision::BF16),
|
||||
::testing::ValuesIn(precisions),
|
||||
::testing::Values(Layout::ANY),
|
||||
::testing::Values(Layout::ANY),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
ConvertLayerTest::getTestCaseName);
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_ConvertLayerTest_To_BF16, ConvertCPULayerTest,
|
||||
::testing::Combine(
|
||||
::testing::Values(inShape),
|
||||
::testing::ValuesIn(precisions),
|
||||
::testing::Values(Precision::BF16),
|
||||
::testing::Values(Layout::ANY),
|
||||
::testing::Values(Layout::ANY),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
ConvertLayerTest::getTestCaseName);
|
||||
} // namespace
|
||||
} // namespace CPULayerTestsDefinitions
|
@ -0,0 +1,176 @@
|
||||
// Copyright (C) 2020 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <single_layer_tests/strided_slice.hpp>
|
||||
#include "ngraph_functions/builders.hpp"
|
||||
#include "test_utils/cpu_test_utils.hpp"
|
||||
|
||||
// Given that the ngraph opset does not contain crop operation, we use the StridedSlice operation instead, since it is mapped to the Crop node if certain
|
||||
// conditions are met.
|
||||
|
||||
using namespace InferenceEngine;
|
||||
using namespace CPUTestUtils;
|
||||
using namespace LayerTestsDefinitions;
|
||||
|
||||
namespace CPULayerTestsDefinitions {
|
||||
|
||||
typedef std::tuple<
|
||||
StridedSliceSpecificParams,
|
||||
InferenceEngine::Precision, // Net precision
|
||||
std::string, // Device name
|
||||
std::map<std::string, std::string>, // Additional network configuration
|
||||
CPUSpecificParams> CropLayerCPUTestParamSet;
|
||||
|
||||
// CPU-specific Crop test. The ngraph opset has no Crop op, so a StridedSlice is
// built instead; with unit strides and no axis masks it maps to the CPU Crop node.
class CropLayerCPUTest : public testing::WithParamInterface<CropLayerCPUTestParamSet>,
                         virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
public:
    static std::string getTestCaseName(testing::TestParamInfo<CropLayerCPUTestParamSet> obj) {
        StridedSliceSpecificParams params;
        InferenceEngine::Precision netPrc;
        std::string targetName;
        std::map<std::string, std::string> additionalConfig;
        CPUSpecificParams cpuParams;
        std::tie(params, netPrc, targetName, additionalConfig, cpuParams) = obj.param;

        std::ostringstream result;
        result << "inShape=" << CommonTestUtils::vec2str(params.inputShape) << "_";
        result << "netPRC=" << netPrc.name() << "_";
        result << "begin=" << CommonTestUtils::vec2str(params.begin) << "_";
        result << "end=" << CommonTestUtils::vec2str(params.end) << "_";
        result << "stride=" << CommonTestUtils::vec2str(params.strides) << "_";
        result << "begin_m=" << CommonTestUtils::vec2str(params.beginMask) << "_";
        result << "end_m=" << CommonTestUtils::vec2str(params.endMask) << "_";
        // NOTE(review): inside these if-blocks the mask is known non-empty, so the
        // "def" branch of each ternary is dead — harmless, kept as-is.
        if (!params.newAxisMask.empty()) {
            result << "new_axis_m=" << (params.newAxisMask.empty() ? "def" : CommonTestUtils::vec2str(params.newAxisMask)) << "_";
        }
        if (!params.shrinkAxisMask.empty()) {
            result << "shrink_m=" << (params.shrinkAxisMask.empty() ? "def" : CommonTestUtils::vec2str(params.shrinkAxisMask)) << "_";
        }
        if (!params.ellipsisAxisMask.empty()) {
            result << "ellipsis_m=" << (params.ellipsisAxisMask.empty() ? "def" : CommonTestUtils::vec2str(params.ellipsisAxisMask)) << "_";
        }
        result << "trgDev=" << targetName;
        result << CPUTestsBase::getTestCaseName(cpuParams);

        return result.str();
    }
protected:
    void SetUp() override {
        StridedSliceSpecificParams ssParams;
        InferenceEngine::Precision netPrecision;
        std::map<std::string, std::string> additionalConfig;
        CPUSpecificParams cpuParams;
        std::tie(ssParams, netPrecision, targetDevice, additionalConfig, cpuParams) = this->GetParam();
        inPrc = outPrc = netPrecision; // because crop does not convert Precisions, but only moves the data
        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
        configuration.insert(additionalConfig.begin(), additionalConfig.end());

        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
        auto params = ngraph::builder::makeParams(ngPrc, {ssParams.inputShape});
        auto paramOuts = ngraph::helpers::convert2OutputVector(
                ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
        auto ss = ngraph::builder::makeStridedSlice(paramOuts[0], ssParams.begin, ssParams.end, ssParams.strides, ngPrc, ssParams.beginMask,
                                                    ssParams.endMask, ssParams.newAxisMask, ssParams.shrinkAxisMask, ssParams.ellipsisAxisMask);

        // Crop has no dedicated jit primitive; "unknown_<precision>" is the
        // expected implementation name reported by the plugin.
        selectedType = std::string("unknown_") + inPrc.name();

        // Attach the CPU-specific hints (formats, impl priority) to the node.
        ss->get_rt_info() = getCPUInfo();

        ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(ss)};
        function = std::make_shared<ngraph::Function>(results, params, "StridedSlice");
    }
};
|
||||
|
||||
TEST_P(CropLayerCPUTest, CompareWithRefs) {
    SKIP_IF_CURRENT_TEST_IS_DISABLED()

    Run();
    // The StridedSlice patterns used by this suite must have been mapped to the
    // CPU Crop node; verify that is what actually executed.
    CheckCPUImpl(executableNetwork, "Crop");
}
|
||||
|
||||
namespace {
|
||||
const std::map<std::string, std::string> additional_config;
|
||||
|
||||
const std::vector<Precision> netPrc = {Precision::BF16, Precision::FP32};
|
||||
|
||||
const std::vector<StridedSliceSpecificParams> testCasesPlain2D = {StridedSliceSpecificParams{ { 32, 32 }, { 0, 20 }, { 32, 30 }, { 1, 1 },
|
||||
{ 0, 0 }, { 0, 0 }, { }, { }, { } },
|
||||
StridedSliceSpecificParams{ { 32, 20 }, { 2, 10 }, { 32, 20 }, { 1, 1 },
|
||||
{ 0, 0 }, { 0, 0 }, { }, { }, { } } };
|
||||
|
||||
const auto CropParamsPlain2D = ::testing::Combine(
|
||||
::testing::ValuesIn(testCasesPlain2D),
|
||||
::testing::ValuesIn(netPrc),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config),
|
||||
::testing::Values(emptyCPUSpec));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Plain_2D, CropLayerCPUTest, CropParamsPlain2D, CropLayerCPUTest::getTestCaseName);
|
||||
|
||||
const std::vector<StridedSliceSpecificParams> testCasesPlain4D = {
|
||||
StridedSliceSpecificParams{ { 1, 5, 32, 32 }, { 0, 2, 5, 4 }, { 1, 4, 28, 27 }, { 1, 1, 1, 1 },
|
||||
{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } },
|
||||
StridedSliceSpecificParams{ { 1, 5, 32, 32 }, { 0, 0, 20, 20 }, { 1, 5, 25, 25 }, { 1, 1, 1, 1 },
|
||||
{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } },
|
||||
StridedSliceSpecificParams{ { 1, 5, 32, 32 }, { 0, 0, 0, 20 }, { 1, 5, 32, 30 }, { 1, 1, 1, 1 },
|
||||
{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } },
|
||||
StridedSliceSpecificParams{ { 1, 5, 32, 20 }, { 0, 0, 2, 10 }, { 1, 5, 32, 20 }, { 1, 1, 1, 1 },
|
||||
{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }
|
||||
};
|
||||
|
||||
std::vector<CPUSpecificParams> cpuParams_4D = {
|
||||
CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}),
|
||||
CPUSpecificParams({nchw}, {nchw}, {}, {})
|
||||
};
|
||||
|
||||
const auto CropParamsPlain4D = ::testing::Combine(
|
||||
::testing::ValuesIn(testCasesPlain4D),
|
||||
::testing::ValuesIn(netPrc),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config),
|
||||
::testing::Values(cpuParams_4D.at(1)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Plain_4D, CropLayerCPUTest, CropParamsPlain4D, CropLayerCPUTest::getTestCaseName);
|
||||
|
||||
const std::vector<StridedSliceSpecificParams> testCasesBlocked4D = {
|
||||
StridedSliceSpecificParams{ { 1, 16, 32, 32 }, { 0, 0, 20, 20 }, { 1, 16, 25, 25 }, { 1, 1, 1, 1 },
|
||||
{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } },
|
||||
StridedSliceSpecificParams{ { 1, 32, 32, 32 }, { 0, 0, 0, 20 }, { 1, 16, 32, 30 }, { 1, 1, 1, 1 },
|
||||
{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } },
|
||||
};
|
||||
|
||||
const auto CropParamsBlocked4D = ::testing::Combine(
|
||||
::testing::ValuesIn(testCasesBlocked4D),
|
||||
::testing::ValuesIn(netPrc),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config),
|
||||
::testing::Values(filterCPUSpecificParams(cpuParams_4D).front()));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Blocked_4D, CropLayerCPUTest, CropParamsBlocked4D, CropLayerCPUTest::getTestCaseName);
|
||||
|
||||
const std::vector<StridedSliceSpecificParams> testCasesPlain4DynBatch = {
|
||||
StridedSliceSpecificParams{ { 10, 5, 32, 32 }, { 0, 2, 5, 4 }, { 1, 4, 28, 27 }, { 1, 1, 1, 1 },
|
||||
{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } },
|
||||
StridedSliceSpecificParams{ { 10, 5, 32, 32 }, { 0, 0, 20, 20 }, { 1, 5, 25, 25 }, { 1, 1, 1, 1 },
|
||||
{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } },
|
||||
StridedSliceSpecificParams{ { 10, 5, 32, 32 }, { 0, 0, 0, 20 }, { 1, 5, 32, 30 }, { 1, 1, 1, 1 },
|
||||
{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } },
|
||||
StridedSliceSpecificParams{ { 10, 5, 32, 20 }, { 0, 0, 2, 10 }, { 1, 5, 32, 20 }, { 1, 1, 1, 1 },
|
||||
{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }
|
||||
};
|
||||
|
||||
std::map<std::string, std::string> additional_config_dyn_batch = {{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO},
|
||||
{PluginConfigParams::KEY_DYN_BATCH_ENABLED, PluginConfigParams::YES}};
|
||||
|
||||
const auto CropParamsPlain4DynBatch = ::testing::Combine(
|
||||
::testing::ValuesIn(testCasesPlain4DynBatch),
|
||||
::testing::ValuesIn(netPrc),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config_dyn_batch),
|
||||
::testing::Values(cpuParams_4D.at(1)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Blocked_4DynBatch, CropLayerCPUTest, CropParamsPlain4DynBatch, CropLayerCPUTest::getTestCaseName);
|
||||
} // namespace
|
||||
} // namespace CPULayerTestsDefinitions
|
||||
|
@ -47,17 +47,7 @@ protected:
|
||||
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
|
||||
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
|
||||
|
||||
std::string isaType;
|
||||
if (with_cpu_x86_avx512f()) {
|
||||
isaType = "jit_avx512";
|
||||
} else if (with_cpu_x86_avx2()) {
|
||||
isaType = "jit_avx2";
|
||||
} else if (with_cpu_x86_sse42()) {
|
||||
isaType = "jit_sse42";
|
||||
} else {
|
||||
isaType = "ref";
|
||||
}
|
||||
selectedType = isaType + "_" + "FP32";
|
||||
selectedType = getPrimitiveType() + "_" + inPrc.name();
|
||||
|
||||
std::vector<size_t> inputShape1, inputShape2;
|
||||
if (inputShapes.size() == 1) {
|
||||
@ -90,12 +80,7 @@ protected:
|
||||
eltwiseType == ngraph::helpers::EltwiseTypes::FLOOR_MOD ||
|
||||
eltwiseType == ngraph::helpers::EltwiseTypes::MOD) {
|
||||
std::vector<float> data(ngraph::shape_size(shape_input_secondary));
|
||||
data = NGraphFunctions::Utils::generateVector<ngraph::element::Type_t::f32>(ngraph::shape_size(shape_input_secondary));
|
||||
for (float &i : data) {
|
||||
if (i == 0) {
|
||||
i = 1;
|
||||
}
|
||||
}
|
||||
data = NGraphFunctions::Utils::generateVector<ngraph::element::Type_t::f32>(ngraph::shape_size(shape_input_secondary), 10, 2);
|
||||
secondaryInput = ngraph::builder::makeConstant(ngPrc, shape_input_secondary, data);
|
||||
} else {
|
||||
secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, shape_input_secondary);
|
||||
@ -105,7 +90,7 @@ protected:
|
||||
}
|
||||
|
||||
auto eltwise = ngraph::builder::makeEltwise(input[0], secondaryInput, eltwiseType);
|
||||
eltwise->get_rt_info() = CPUTestsBase::setCPUInfo(inFmts, outFmts, priority);
|
||||
eltwise->get_rt_info() = getCPUInfo();
|
||||
function = std::make_shared<ngraph::Function>(eltwise, input, "Eltwise");
|
||||
}
|
||||
};
|
||||
@ -114,7 +99,7 @@ TEST_P(EltwiseLayerCPUTest, CompareWithRefs) {
|
||||
SKIP_IF_CURRENT_TEST_IS_DISABLED()
|
||||
|
||||
Run();
|
||||
CheckCPUImpl(executableNetwork, "Eltwise", inFmts, outFmts, selectedType);
|
||||
CheckCPUImpl(executableNetwork, "Eltwise");
|
||||
}
|
||||
|
||||
namespace {
|
||||
@ -128,7 +113,7 @@ std::vector<CommonTestUtils::OpType> opTypes = {
|
||||
CommonTestUtils::OpType::VECTOR,
|
||||
};
|
||||
|
||||
std::vector<ngraph::helpers::EltwiseTypes> eltwiseOpTypes = {
|
||||
std::vector<ngraph::helpers::EltwiseTypes> eltwiseOpTypesBinInp = {
|
||||
ngraph::helpers::EltwiseTypes::ADD,
|
||||
ngraph::helpers::EltwiseTypes::MULTIPLY,
|
||||
// TODO: Disabled because memory formats filter is not propogated through ngraph transformations
|
||||
@ -138,27 +123,15 @@ std::vector<ngraph::helpers::EltwiseTypes> eltwiseOpTypes = {
|
||||
ngraph::helpers::EltwiseTypes::SQUARED_DIFF,
|
||||
};
|
||||
|
||||
std::map<std::string, std::string> additional_config = {};
|
||||
std::vector<ngraph::helpers::EltwiseTypes> eltwiseOpTypesDiffInp = { // Different number of input nodes depending on optimizations
|
||||
ngraph::helpers::EltwiseTypes::POWER,
|
||||
// ngraph::helpers::EltwiseTypes::MOD // Does not execute because of transformations
|
||||
};
|
||||
|
||||
std::vector<CPUSpecificParams> filterCPUSpecificParams(std::vector<CPUSpecificParams>& paramsVector) {
|
||||
auto adjustBlockedFormatByIsa = [](std::vector<cpu_memory_format_t>& formats) {
|
||||
for (int i = 0; i < formats.size(); i++) {
|
||||
if (formats[i] == nChw16c)
|
||||
formats[i] = nChw8c;
|
||||
if (formats[i] == nCdhw16c)
|
||||
formats[i] = nCdhw8c;
|
||||
}
|
||||
};
|
||||
std::map<std::string, std::string> additional_config;
|
||||
|
||||
if (!with_cpu_x86_avx512f()) {
|
||||
for (auto& param : paramsVector) {
|
||||
adjustBlockedFormatByIsa(std::get<0>(param));
|
||||
adjustBlockedFormatByIsa(std::get<1>(param));
|
||||
}
|
||||
}
|
||||
std::vector<Precision> bf16InpOutPrc = {Precision::BF16, Precision::FP32};
|
||||
|
||||
return paramsVector;
|
||||
}
|
||||
|
||||
std::vector<std::vector<std::vector<size_t>>> inShapes_4D = {
|
||||
{{2, 4, 4, 1}},
|
||||
@ -176,19 +149,50 @@ std::vector<CPUSpecificParams> cpuParams_4D = {
|
||||
const auto params_4D_FP32 = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inShapes_4D),
|
||||
::testing::ValuesIn(eltwiseOpTypes),
|
||||
::testing::ValuesIn(eltwiseOpTypesBinInp),
|
||||
::testing::ValuesIn(secondaryInputTypes),
|
||||
::testing::ValuesIn(opTypes),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config)),
|
||||
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_FP32, EltwiseLayerCPUTest, params_4D_FP32, EltwiseLayerCPUTest::getTestCaseName);
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_FP32_MemOrder, EltwiseLayerCPUTest, params_4D_FP32, EltwiseLayerCPUTest::getTestCaseName);
|
||||
|
||||
const auto params_4D_BF16 = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inShapes_4D),
|
||||
::testing::ValuesIn(eltwiseOpTypesBinInp),
|
||||
::testing::ValuesIn(secondaryInputTypes),
|
||||
::testing::ValuesIn(opTypes),
|
||||
::testing::Values(InferenceEngine::Precision::BF16),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config)),
|
||||
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_BF16_MemOrder, EltwiseLayerCPUTest, params_4D_BF16, EltwiseLayerCPUTest::getTestCaseName);
|
||||
|
||||
const auto params_4D_BF16_emptyCPUSpec = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inShapes_4D),
|
||||
::testing::ValuesIn(eltwiseOpTypesDiffInp),
|
||||
::testing::ValuesIn(secondaryInputTypes),
|
||||
::testing::ValuesIn(opTypes),
|
||||
::testing::Values(InferenceEngine::Precision::BF16),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config)),
|
||||
::testing::Values(emptyCPUSpec));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_BF16, EltwiseLayerCPUTest, params_4D_BF16_emptyCPUSpec, EltwiseLayerCPUTest::getTestCaseName);
|
||||
|
||||
std::vector<std::vector<std::vector<size_t>>> inShapes_5D = {
|
||||
{{2, 4, 3, 4, 1}},
|
||||
@ -206,19 +210,50 @@ std::vector<CPUSpecificParams> cpuParams_5D = {
|
||||
const auto params_5D_FP32 = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inShapes_5D),
|
||||
::testing::ValuesIn(eltwiseOpTypes),
|
||||
::testing::ValuesIn(eltwiseOpTypesBinInp),
|
||||
::testing::ValuesIn(secondaryInputTypes),
|
||||
::testing::ValuesIn(opTypes),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config)),
|
||||
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_FP32, EltwiseLayerCPUTest, params_5D_FP32, EltwiseLayerCPUTest::getTestCaseName);
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_FP32_MemOrder, EltwiseLayerCPUTest, params_5D_FP32, EltwiseLayerCPUTest::getTestCaseName);
|
||||
|
||||
const auto params_5D_BF16 = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inShapes_5D),
|
||||
::testing::ValuesIn(eltwiseOpTypesBinInp),
|
||||
::testing::ValuesIn(secondaryInputTypes),
|
||||
::testing::ValuesIn(opTypes),
|
||||
::testing::Values(InferenceEngine::Precision::BF16),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config)),
|
||||
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_BF16_MemOrder, EltwiseLayerCPUTest, params_5D_BF16, EltwiseLayerCPUTest::getTestCaseName);
|
||||
|
||||
const auto params_5D_BF16_emptyCPUSpec = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inShapes_5D),
|
||||
::testing::ValuesIn(eltwiseOpTypesDiffInp),
|
||||
::testing::ValuesIn(secondaryInputTypes),
|
||||
::testing::ValuesIn(opTypes),
|
||||
::testing::Values(InferenceEngine::Precision::BF16),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config)),
|
||||
::testing::Values(emptyCPUSpec));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_BF16, EltwiseLayerCPUTest, params_5D_BF16_emptyCPUSpec, EltwiseLayerCPUTest::getTestCaseName);
|
||||
|
||||
std::vector<std::vector<std::vector<size_t>>> inShapes_4D_Blocked_Planar = {
|
||||
{{2, 17, 31, 3}, {2, 1, 31, 3}},
|
||||
@ -232,12 +267,12 @@ std::vector<CPUSpecificParams> cpuParams_4D_Blocked_Planar = {
|
||||
const auto params_4D_FP32_Blocked_Planar = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inShapes_4D_Blocked_Planar),
|
||||
::testing::ValuesIn(eltwiseOpTypes),
|
||||
::testing::ValuesIn(eltwiseOpTypesBinInp),
|
||||
::testing::Values(ngraph::helpers::InputLayerType::CONSTANT),
|
||||
::testing::ValuesIn(opTypes),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config)),
|
||||
@ -258,12 +293,12 @@ std::vector<CPUSpecificParams> cpuParams_4D_Planar_Blocked = {
|
||||
const auto params_4D_FP32_Planar_Blocked = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inShapes_4D_Planar_Blocked),
|
||||
::testing::ValuesIn(eltwiseOpTypes),
|
||||
::testing::ValuesIn(eltwiseOpTypesBinInp),
|
||||
::testing::Values(ngraph::helpers::InputLayerType::CONSTANT),
|
||||
::testing::ValuesIn(opTypes),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config)),
|
||||
@ -284,12 +319,12 @@ std::vector<CPUSpecificParams> cpuParams_5D_Blocked_Planar = {
|
||||
const auto params_5D_FP32_Blocked_Planar = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inShapes_5D_Blocked_Planar),
|
||||
::testing::ValuesIn(eltwiseOpTypes),
|
||||
::testing::ValuesIn(eltwiseOpTypesBinInp),
|
||||
::testing::Values(ngraph::helpers::InputLayerType::CONSTANT),
|
||||
::testing::ValuesIn(opTypes),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config)),
|
||||
@ -310,12 +345,12 @@ std::vector<CPUSpecificParams> cpuParams_5D_Planar_Blocked = {
|
||||
const auto params_5D_FP32_Planar_Blocked = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inShapes_5D_Planar_Blocked),
|
||||
::testing::ValuesIn(eltwiseOpTypes),
|
||||
::testing::ValuesIn(eltwiseOpTypesBinInp),
|
||||
::testing::Values(ngraph::helpers::InputLayerType::CONSTANT),
|
||||
::testing::ValuesIn(opTypes),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config)),
|
||||
|
@ -57,7 +57,7 @@ protected:
|
||||
auto groupConv = std::dynamic_pointer_cast<ngraph::opset1::GroupConvolution>(
|
||||
ngraph::builder::makeGroupConvolution(paramOuts[0], ngPrc, kernel, stride, padBegin,
|
||||
padEnd, dilation, padType, convOutChannels, numGroups));
|
||||
groupConv->get_rt_info() = setCPUInfo(inFmts, outFmts, priority);
|
||||
groupConv->get_rt_info() = getCPUInfo();
|
||||
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(groupConv)};
|
||||
function = std::make_shared<ngraph::Function>(results, params, "groupConvolution");
|
||||
}
|
||||
@ -67,7 +67,7 @@ TEST_P(GroupConvolutionLayerCPUTest, CompareWithRefs) {
|
||||
SKIP_IF_CURRENT_TEST_IS_DISABLED()
|
||||
|
||||
Run();
|
||||
CheckCPUImpl(executableNetwork, "Convolution", inFmts, outFmts, selectedType);
|
||||
CheckCPUImpl(executableNetwork, "Convolution");
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
@ -78,21 +78,17 @@ protected:
|
||||
scalesInput,
|
||||
axesInput,
|
||||
interpolateAttributes);
|
||||
interpolate->get_rt_info() = CPUTestsBase::setCPUInfo(inFmts, outFmts, priority);
|
||||
interpolate->get_rt_info() = getCPUInfo();
|
||||
const ngraph::ResultVector results{std::make_shared<ngraph::opset3::Result>(interpolate)};
|
||||
function = std::make_shared<ngraph::Function>(results, params, "interpolate");
|
||||
}
|
||||
|
||||
std::vector<cpu_memory_format_t> inFmts, outFmts;
|
||||
std::vector<std::string> priority;
|
||||
std::string selectedType;
|
||||
};
|
||||
|
||||
TEST_P(InterpolateLayerCPUTest, CompareWithRefs) {
|
||||
SKIP_IF_CURRENT_TEST_IS_DISABLED()
|
||||
|
||||
Run();
|
||||
CheckCPUImpl(executableNetwork, "Interpolate", inFmts, outFmts, selectedType);
|
||||
CheckCPUImpl(executableNetwork, "Interpolate");
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
@ -0,0 +1,158 @@
|
||||
// Copyright (C) 2020 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <single_layer_tests/logical.hpp>
|
||||
#include "ngraph_functions/builders.hpp"
|
||||
#include "test_utils/cpu_test_utils.hpp"
|
||||
|
||||
using namespace InferenceEngine;
|
||||
using namespace CPUTestUtils;
|
||||
using namespace ngraph::helpers;
|
||||
|
||||
namespace CPULayerTestsDefinitions {
|
||||
|
||||
typedef std::tuple<
|
||||
LayerTestsDefinitions::LogicalTestParams,
|
||||
CPUSpecificParams>
|
||||
LogicalLayerCPUTestParamSet;
|
||||
|
||||
class LogicalLayerCPUTest : public testing::WithParamInterface<LogicalLayerCPUTestParamSet>,
|
||||
virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<LogicalLayerCPUTestParamSet> obj) {
|
||||
LayerTestsDefinitions::LogicalTestParams basicParamsSet;
|
||||
CPUSpecificParams cpuParams;
|
||||
std::tie(basicParamsSet, cpuParams) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
result << LayerTestsDefinitions::LogicalLayerTest::getTestCaseName(testing::TestParamInfo<LayerTestsDefinitions::LogicalTestParams>(
|
||||
basicParamsSet, 0));
|
||||
|
||||
result << CPUTestsBase::getTestCaseName(cpuParams);
|
||||
|
||||
return result.str();
|
||||
}
|
||||
|
||||
protected:
|
||||
void SetUp() override {
|
||||
LayerTestsDefinitions::LogicalTestParams basicParamsSet;
|
||||
CPUSpecificParams cpuParams;
|
||||
std::tie(basicParamsSet, cpuParams) = this->GetParam();
|
||||
|
||||
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
|
||||
|
||||
LayerTestsDefinitions::LogicalParams::InputShapesTuple inputShapes;
|
||||
ngraph::helpers::LogicalTypes logicalOpType;
|
||||
ngraph::helpers::InputLayerType secondInputType;
|
||||
InferenceEngine::Precision netPrecision;
|
||||
std::string targetName;
|
||||
std::map<std::string, std::string> additional_config;
|
||||
std::tie(inputShapes, logicalOpType, secondInputType, netPrecision, inPrc, outPrc,
|
||||
inLayout, outLayout, targetDevice, additional_config) = basicParamsSet;
|
||||
|
||||
selectedType = getPrimitiveType() + "_" + inPrc.name();
|
||||
|
||||
auto ngInputsPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(Precision::BOOL); // Because ngraph supports only boolean input for logical ops
|
||||
configuration.insert(additional_config.begin(), additional_config.end());
|
||||
|
||||
auto inputs = ngraph::builder::makeParams(ngInputsPrc, {inputShapes.first});
|
||||
|
||||
std::shared_ptr<ngraph::Node> logicalNode;
|
||||
if (logicalOpType != ngraph::helpers::LogicalTypes::LOGICAL_NOT) {
|
||||
auto secondInput = ngraph::builder::makeInputLayer(ngInputsPrc, secondInputType, inputShapes.second);
|
||||
if (secondInputType == ngraph::helpers::InputLayerType::PARAMETER) {
|
||||
inputs.push_back(std::dynamic_pointer_cast<ngraph::opset3::Parameter>(secondInput));
|
||||
}
|
||||
logicalNode = ngraph::builder::makeLogical(inputs[0], secondInput, logicalOpType);
|
||||
} else {
|
||||
logicalNode = ngraph::builder::makeLogical(inputs[0], ngraph::Output<ngraph::Node>(), logicalOpType);
|
||||
}
|
||||
|
||||
logicalNode->get_rt_info() = getCPUInfo();
|
||||
|
||||
function = std::make_shared<ngraph::Function>(logicalNode, inputs, "Logical");
|
||||
}
|
||||
};
|
||||
|
||||
TEST_P(LogicalLayerCPUTest, CompareWithRefs) {
|
||||
SKIP_IF_CURRENT_TEST_IS_DISABLED()
|
||||
|
||||
Run();
|
||||
CheckCPUImpl(executableNetwork, "Eltwise");
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
std::map<std::vector<size_t>, std::vector<std::vector<size_t >>> inputShapes = {
|
||||
{{1}, {{1}, {17}, {1, 1}, {2, 18}, {1, 1, 2}, {2, 2, 3}, {1, 1, 2, 3}}},
|
||||
{{5}, {{1}, {1, 1}, {2, 5}, {1, 1, 1}, {2, 2, 5}}},
|
||||
{{2, 200}, {{1}, {200}, {1, 200}, {2, 200}, {2, 2, 200}}},
|
||||
{{1, 3, 20}, {{20}, {2, 1, 1}}},
|
||||
{{2, 17, 3, 4}, {{4}, {1, 3, 4}, {2, 1, 3, 4}}},
|
||||
{{2, 1, 1, 3, 1}, {{1}, {1, 3, 4}, {2, 1, 3, 4}, {1, 1, 1, 1, 1}}},
|
||||
};
|
||||
|
||||
std::map<std::vector<size_t>, std::vector<std::vector<size_t >>> inputShapesNot = {
|
||||
{{1}, {}},
|
||||
{{5}, {}},
|
||||
{{2, 200}, {}},
|
||||
{{1, 3, 20}, {}},
|
||||
{{2, 17, 3, 4}, {}},
|
||||
{{2, 1, 1, 3, 1}, {}},
|
||||
};
|
||||
|
||||
std::vector<InferenceEngine::Precision> inputsPrecisions = {
|
||||
InferenceEngine::Precision::BOOL,
|
||||
};
|
||||
|
||||
std::vector<ngraph::helpers::LogicalTypes> logicalOpTypes = {
|
||||
ngraph::helpers::LogicalTypes::LOGICAL_AND,
|
||||
ngraph::helpers::LogicalTypes::LOGICAL_OR,
|
||||
ngraph::helpers::LogicalTypes::LOGICAL_XOR,
|
||||
};
|
||||
|
||||
std::vector<ngraph::helpers::InputLayerType> secondInputTypes = {
|
||||
ngraph::helpers::InputLayerType::CONSTANT,
|
||||
ngraph::helpers::InputLayerType::PARAMETER,
|
||||
};
|
||||
|
||||
std::map<std::string, std::string> additional_config;
|
||||
|
||||
std::vector<Precision> bf16InpOutPrc = {Precision::BF16, Precision::FP32};
|
||||
|
||||
const auto LogicalTestParams = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(LayerTestsDefinitions::LogicalLayerTest::combineShapes(inputShapes)),
|
||||
::testing::ValuesIn(logicalOpTypes),
|
||||
::testing::ValuesIn(secondInputTypes),
|
||||
::testing::Values(Precision::BF16),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::Values(Layout::ANY),
|
||||
::testing::Values(Layout::ANY),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config)),
|
||||
::testing::Values(emptyCPUSpec));
|
||||
|
||||
const auto LogicalTestParamsNot = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(LayerTestsDefinitions::LogicalLayerTest::combineShapes(inputShapesNot)),
|
||||
::testing::Values(ngraph::helpers::LogicalTypes::LOGICAL_NOT),
|
||||
::testing::Values(ngraph::helpers::InputLayerType::CONSTANT),
|
||||
::testing::Values(Precision::BF16),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::ValuesIn(bf16InpOutPrc),
|
||||
::testing::Values(Layout::ANY),
|
||||
::testing::Values(Layout::ANY),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config)),
|
||||
::testing::Values(emptyCPUSpec));
|
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_Logical_Eltwise_CPU_BF16, LogicalLayerCPUTest, LogicalTestParams, LogicalLayerCPUTest::getTestCaseName);
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_Logical_Not_Eltwise_CPU_BF16, LogicalLayerCPUTest, LogicalTestParamsNot, LogicalLayerCPUTest::getTestCaseName);
|
||||
|
||||
} // namespace
|
||||
} // namespace CPULayerTestsDefinitions
|
@ -0,0 +1,200 @@
|
||||
// Copyright (C) 2020 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <single_layer_tests/mvn.hpp>
|
||||
#include "ngraph_functions/builders.hpp"
|
||||
#include "test_utils/cpu_test_utils.hpp"
|
||||
|
||||
using namespace InferenceEngine;
|
||||
using namespace CPUTestUtils;
|
||||
|
||||
namespace CPULayerTestsDefinitions {
|
||||
|
||||
typedef std::tuple<
|
||||
LayerTestsDefinitions::mvnParams,
|
||||
CPUSpecificParams,
|
||||
Precision, // CNNNetwork input precision
|
||||
Precision> // CNNNetwork output precision
|
||||
MvnLayerCPUTestParamSet;
|
||||
|
||||
class MvnLayerCPUTest : public testing::WithParamInterface<MvnLayerCPUTestParamSet>,
|
||||
virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<MvnLayerCPUTestParamSet> obj) {
|
||||
LayerTestsDefinitions::mvnParams basicParamsSet;
|
||||
CPUSpecificParams cpuParams;
|
||||
Precision inputPrecision, outputPrecision;
|
||||
std::tie(basicParamsSet, cpuParams, inputPrecision, outputPrecision) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
result << LayerTestsDefinitions::MvnLayerTest::getTestCaseName(testing::TestParamInfo<LayerTestsDefinitions::mvnParams>(
|
||||
basicParamsSet, 0));
|
||||
|
||||
result << "_" << "CNNInpPrc=" << inputPrecision.name();
|
||||
result << "_" << "CNNOutPrc=" << outputPrecision.name();
|
||||
|
||||
result << CPUTestsBase::getTestCaseName(cpuParams);
|
||||
|
||||
return result.str();
|
||||
}
|
||||
protected:
|
||||
void SetUp() override {
|
||||
LayerTestsDefinitions::mvnParams basicParamsSet;
|
||||
CPUSpecificParams cpuParams;
|
||||
std::tie(basicParamsSet, cpuParams, inPrc, outPrc) = this->GetParam();
|
||||
|
||||
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
|
||||
|
||||
InferenceEngine::SizeVector inputShapes;
|
||||
InferenceEngine::Precision netPrecision;
|
||||
bool acrossChanels, normalizeVariance;
|
||||
double eps;
|
||||
std::tie(inputShapes, netPrecision, acrossChanels, normalizeVariance, eps, targetDevice) = basicParamsSet;
|
||||
auto netPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
|
||||
auto param = ngraph::builder::makeParams(netPrc, {inputShapes});
|
||||
auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(param));
|
||||
auto mvn = ngraph::builder::makeMVN(paramOuts[0], acrossChanels, normalizeVariance, eps);
|
||||
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(mvn)};
|
||||
|
||||
selectedType = getPrimitiveType() + "_" + inPrc.name();
|
||||
|
||||
threshold = 0.015f;
|
||||
|
||||
mvn->get_rt_info() = getCPUInfo();
|
||||
|
||||
function = std::make_shared<ngraph::Function>(results, param, "mvn");
|
||||
}
|
||||
};
|
||||
|
||||
TEST_P(MvnLayerCPUTest, CompareWithRefs) {
|
||||
SKIP_IF_CURRENT_TEST_IS_DISABLED()
|
||||
|
||||
Run();
|
||||
CheckCPUImpl(executableNetwork, "MVN");
|
||||
}
|
||||
|
||||
namespace {
|
||||
const std::vector<std::vector<size_t>> inputShapes_3D = {
|
||||
{1, 32, 17},
|
||||
{1, 37, 9},
|
||||
};
|
||||
|
||||
const std::vector<std::vector<size_t>> inputShapes_4D = {
|
||||
{1, 16, 5, 8},
|
||||
{2, 19, 5, 10},
|
||||
{7, 32, 2, 8},
|
||||
{5, 8, 3, 5},
|
||||
{4, 41, 6, 9}
|
||||
};
|
||||
|
||||
const std::vector<std::vector<size_t>> inputShapes_5D = {
|
||||
{1, 32, 8, 1, 6},
|
||||
{1, 9, 1, 15, 9},
|
||||
{6, 64, 6, 1, 18},
|
||||
{2, 31, 2, 9, 1},
|
||||
{10, 16, 5, 10, 6}
|
||||
};
|
||||
|
||||
const std::vector<bool> acrossChannels = {
|
||||
true,
|
||||
false
|
||||
};
|
||||
|
||||
const std::vector<bool> normalizeVariance = {
|
||||
true,
|
||||
false
|
||||
};
|
||||
|
||||
const std::vector<double> epsilon = {
|
||||
0.000000001
|
||||
};
|
||||
|
||||
std::vector<Precision> inpOutPrc = {Precision::BF16, Precision::FP32};
|
||||
|
||||
std::vector<CPUSpecificParams> cpuParams_4D = {
|
||||
CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}),
|
||||
CPUSpecificParams({nchw}, {nchw}, {}, {})
|
||||
};
|
||||
|
||||
std::vector<CPUSpecificParams> cpuParams_5D = {
|
||||
CPUSpecificParams({nCdhw16c}, {nCdhw16c}, {}, {}),
|
||||
CPUSpecificParams({ncdhw}, {ncdhw}, {}, {})
|
||||
};
|
||||
|
||||
const auto Mvn3D = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inputShapes_3D),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::ValuesIn(acrossChannels),
|
||||
::testing::ValuesIn(normalizeVariance),
|
||||
::testing::ValuesIn(epsilon),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
::testing::Values(emptyCPUSpec),
|
||||
::testing::ValuesIn(inpOutPrc),
|
||||
::testing::ValuesIn(inpOutPrc));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_3D, MvnLayerCPUTest, Mvn3D, MvnLayerCPUTest::getTestCaseName);
|
||||
|
||||
const auto Mvn4D = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inputShapes_4D),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::ValuesIn(acrossChannels),
|
||||
::testing::ValuesIn(normalizeVariance),
|
||||
::testing::ValuesIn(epsilon),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)),
|
||||
::testing::ValuesIn(inpOutPrc),
|
||||
::testing::ValuesIn(inpOutPrc));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D, MvnLayerCPUTest, Mvn4D, MvnLayerCPUTest::getTestCaseName);
|
||||
|
||||
|
||||
const auto MvnNHWC = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inputShapes_4D),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(false),
|
||||
::testing::Values(true),
|
||||
::testing::ValuesIn(epsilon),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
::testing::Values(CPUSpecificParams({nhwc}, {nhwc}, {}, {})),
|
||||
::testing::ValuesIn(inpOutPrc),
|
||||
::testing::ValuesIn(inpOutPrc));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_NHWC, MvnLayerCPUTest, MvnNHWC, MvnLayerCPUTest::getTestCaseName);
|
||||
|
||||
const auto MvnNDHWC = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inputShapes_5D),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::Values(false),
|
||||
::testing::Values(true),
|
||||
::testing::ValuesIn(epsilon),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
::testing::Values(CPUSpecificParams({ndhwc}, {ndhwc}, {}, {})),
|
||||
::testing::ValuesIn(inpOutPrc),
|
||||
::testing::ValuesIn(inpOutPrc));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_NDHWC, MvnLayerCPUTest, MvnNDHWC, MvnLayerCPUTest::getTestCaseName);
|
||||
|
||||
|
||||
const auto Mvn5D = ::testing::Combine(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inputShapes_5D),
|
||||
::testing::Values(InferenceEngine::Precision::FP32),
|
||||
::testing::ValuesIn(acrossChannels),
|
||||
::testing::ValuesIn(normalizeVariance),
|
||||
::testing::ValuesIn(epsilon),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)),
|
||||
::testing::ValuesIn(inpOutPrc),
|
||||
::testing::ValuesIn(inpOutPrc));
|
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D, MvnLayerCPUTest, Mvn5D, MvnLayerCPUTest::getTestCaseName);
|
||||
|
||||
|
||||
} // namespace
|
||||
} // namespace CPULayerTestsDefinitions
|
132
inference-engine/tests/functional/plugin/cpu/single_layer_tests/normalize.cpp
Executable file
132
inference-engine/tests/functional/plugin/cpu/single_layer_tests/normalize.cpp
Executable file
@ -0,0 +1,132 @@
|
||||
// Copyright (C) 2020 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <single_layer_tests/normalize_l2.hpp>
|
||||
#include "ngraph_functions/builders.hpp"
|
||||
#include "test_utils/cpu_test_utils.hpp"
|
||||
|
||||
using namespace InferenceEngine;
|
||||
using namespace CPUTestUtils;
|
||||
|
||||
namespace CPULayerTestsDefinitions {
|
||||
|
||||
typedef std::tuple<
|
||||
LayerTestsDefinitions::NormalizeL2LayerTestParams,
|
||||
CPUSpecificParams>
|
||||
NormalizeL2LayerCPUTestParamSet;
|
||||
|
||||
class NormalizeL2LayerCPUTest : public testing::WithParamInterface<NormalizeL2LayerCPUTestParamSet>,
|
||||
virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<NormalizeL2LayerCPUTestParamSet> obj) {
|
||||
LayerTestsDefinitions::NormalizeL2LayerTestParams basicParamsSet;
|
||||
CPUSpecificParams cpuParams;
|
||||
Precision inputPrecision, outputPrecision;
|
||||
std::tie(basicParamsSet, cpuParams) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
result << LayerTestsDefinitions::NormalizeL2LayerTest::getTestCaseName(testing::TestParamInfo<LayerTestsDefinitions::NormalizeL2LayerTestParams>(
|
||||
basicParamsSet, 0));
|
||||
|
||||
result << CPUTestsBase::getTestCaseName(cpuParams);
|
||||
|
||||
return result.str();
|
||||
}
|
||||
protected:
|
||||
void SetUp() override {
|
||||
LayerTestsDefinitions::NormalizeL2LayerTestParams basicParamsSet;
|
||||
CPUSpecificParams cpuParams;
|
||||
std::tie(basicParamsSet, cpuParams) = this->GetParam();
|
||||
|
||||
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
|
||||
|
||||
std::vector<int64_t> axes;
|
||||
float eps;
|
||||
ngraph::op::EpsMode eps_mode;
|
||||
InferenceEngine::SizeVector inputShapes;
|
||||
InferenceEngine::Precision netPrecision;
|
||||
std::tie(axes, eps, eps_mode, inputShapes, netPrecision, targetDevice) = basicParamsSet;
|
||||
inPrc = outPrc = netPrecision;
|
||||
auto netPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
|
||||
auto param = ngraph::builder::makeParams(netPrc, {inputShapes});
|
||||
auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(param));
|
||||
auto normalize_l2 = ngraph::builder::makeNormalizeL2(paramOuts[0], axes, eps, eps_mode);
|
||||
|
||||
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(normalize_l2)};
|
||||
|
||||
if (Precision::BF16 == netPrecision) {
|
||||
selectedType = "unknown_BF16";
|
||||
} else if (Precision::FP32 == netPrecision) {
|
||||
selectedType = "unknown_FP32";
|
||||
}
|
||||
|
||||
threshold = 0.015f;
|
||||
|
||||
normalize_l2->get_rt_info() = getCPUInfo();
|
||||
|
||||
function = std::make_shared<ngraph::Function>(results, param, "Normalize");
|
||||
}
|
||||
};
|
||||
|
||||
TEST_P(NormalizeL2LayerCPUTest, CompareWithRefs) {
|
||||
SKIP_IF_CURRENT_TEST_IS_DISABLED()
|
||||
|
||||
Run();
|
||||
CheckCPUImpl(executableNetwork, "Normalize");
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
const std::vector<std::vector<int64_t>> axes = {
|
||||
{},
|
||||
{1},
|
||||
};
|
||||
const std::vector<float> eps = { 1e-4f };
|
||||
|
||||
const std::vector<ngraph::op::EpsMode> epsMode = {
|
||||
ngraph::op::EpsMode::ADD,
|
||||
ngraph::op::EpsMode::MAX,
|
||||
};
|
||||
|
||||
std::vector<Precision> inpOutPrc = {Precision::BF16};
|
||||
|
||||
std::vector<CPUSpecificParams> cpuParams_4D = {
|
||||
CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}),
|
||||
CPUSpecificParams({nhwc}, {nhwc}, {}, {}),
|
||||
CPUSpecificParams({nchw}, {nchw}, {}, {})
|
||||
};
|
||||
|
||||
|
||||
const std::vector<Precision> netPrecisions = {
|
||||
Precision::FP32,
|
||||
Precision::BF16
|
||||
};
|
||||
|
||||
const auto NormalizeL23D = testing::Combine(
|
||||
testing::Combine(
|
||||
testing::ValuesIn(axes),
|
||||
testing::ValuesIn(eps),
|
||||
testing::ValuesIn(epsMode),
|
||||
testing::Values(std::vector<size_t>{1, 32, 17}),
|
||||
testing::ValuesIn(netPrecisions),
|
||||
testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
testing::Values(emptyCPUSpec));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_NormalizeL2CompareWithRefs_3D, NormalizeL2LayerCPUTest, NormalizeL23D, NormalizeL2LayerCPUTest::getTestCaseName);
|
||||
|
||||
const auto NormalizeL24D = testing::Combine(
|
||||
testing::Combine(
|
||||
testing::ValuesIn(axes),
|
||||
testing::ValuesIn(eps),
|
||||
testing::ValuesIn(epsMode),
|
||||
testing::Values(std::vector<size_t>{1, 3, 10, 5}),
|
||||
testing::ValuesIn(netPrecisions),
|
||||
testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_NormalizeL2CompareWithRefs_4D, NormalizeL2LayerCPUTest, NormalizeL24D, NormalizeL2LayerCPUTest::getTestCaseName);
|
||||
|
||||
|
||||
} // namespace
|
||||
} // namespace CPULayerTestsDefinitions
|
@ -0,0 +1,148 @@
|
||||
// Copyright (C) 2020 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <single_layer_tests/transpose.hpp>
|
||||
#include "ngraph_functions/builders.hpp"
|
||||
#include "test_utils/cpu_test_utils.hpp"
|
||||
|
||||
// Since the Transpose ngraph operation is converted to the permute node, we will use it in the permute test
|
||||
|
||||
using namespace InferenceEngine;
|
||||
using namespace CPUTestUtils;
|
||||
|
||||
namespace CPULayerTestsDefinitions {
|
||||
|
||||
typedef std::tuple<
|
||||
std::vector<size_t>, // Input order
|
||||
InferenceEngine::Precision, // Net precision
|
||||
std::vector<size_t>, // Input shapes
|
||||
std::string, // Target device name
|
||||
std::map<std::string, std::string>, // Additional network configuration
|
||||
CPUSpecificParams> PermuteLayerCPUTestParamSet;
|
||||
|
||||
class PermuteLayerCPUTest : public testing::WithParamInterface<PermuteLayerCPUTestParamSet>,
|
||||
virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<PermuteLayerCPUTestParamSet> obj) {
|
||||
Precision netPrecision;
|
||||
std::vector<size_t> inputShape, inputOrder;
|
||||
std::string targetDevice;
|
||||
CPUSpecificParams cpuParams;
|
||||
std::map<std::string, std::string> additionalConfig;
|
||||
std::tie(inputOrder, netPrecision, inputShape, targetDevice, additionalConfig, cpuParams) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_";
|
||||
result << "inputOrder=" << CommonTestUtils::vec2str(inputOrder) << "_";
|
||||
result << "netPRC=" << netPrecision.name() << "_";
|
||||
result << "trgDev=" << targetDevice;
|
||||
result << CPUTestsBase::getTestCaseName(cpuParams);
|
||||
return result.str();
|
||||
}
|
||||
protected:
|
||||
void SetUp() override {
|
||||
SetRefMode(LayerTestsUtils::RefMode::CONSTANT_FOLDING);
|
||||
|
||||
Precision netPrecision;
|
||||
std::vector<size_t> inputShape, inputOrder;
|
||||
CPUSpecificParams cpuParams;
|
||||
std::map<std::string, std::string> additionalConfig;
|
||||
std::tie(inputOrder, netPrecision, inputShape, targetDevice, additionalConfig, cpuParams) = this->GetParam();
|
||||
configuration.insert(additionalConfig.begin(), additionalConfig.end());
|
||||
inPrc = outPrc = netPrecision; // since the layer does not convert precisions
|
||||
|
||||
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
|
||||
|
||||
selectedType = std::string("unknown_") + inPrc.name();
|
||||
|
||||
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
|
||||
auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
|
||||
auto paramOuts = ngraph::helpers::convert2OutputVector(
|
||||
ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
|
||||
|
||||
const auto inOrderShape = inputOrder.empty() ? ngraph::Shape({0}) : ngraph::Shape({inputShape.size()});
|
||||
const auto inputOrderOp = std::make_shared<ngraph::opset3::Constant>(ngraph::element::i64,
|
||||
inOrderShape,
|
||||
inputOrder);
|
||||
const auto transpose = std::make_shared<ngraph::opset3::Transpose>(paramOuts.at(0), inputOrderOp);
|
||||
transpose->get_rt_info() = getCPUInfo();
|
||||
const ngraph::ResultVector results{std::make_shared<ngraph::opset3::Result>(transpose)};
|
||||
function = std::make_shared<ngraph::Function>(results, params, "Transpose");
|
||||
}
|
||||
};
|
||||
|
||||
TEST_P(PermuteLayerCPUTest, CompareWithRefs) {
|
||||
SKIP_IF_CURRENT_TEST_IS_DISABLED()
|
||||
|
||||
Run();
|
||||
CheckCPUImpl(executableNetwork, "Permute");
|
||||
}
|
||||
|
||||
namespace {
|
||||
std::map<std::string, std::string> additional_config;
|
||||
|
||||
const std::vector<InferenceEngine::Precision> netPrecisions = {
|
||||
Precision::BF16,
|
||||
Precision::FP32
|
||||
};
|
||||
|
||||
const std::vector<std::vector<size_t>> inputShapes4D = {
|
||||
{2, 32, 10, 20}
|
||||
};
|
||||
|
||||
const std::vector<std::vector<size_t>> inputOrder4D = {
|
||||
std::vector<size_t>{0, 1, 2, 3},
|
||||
std::vector<size_t>{0, 2, 3, 1},
|
||||
std::vector<size_t>{0, 2, 1, 3},
|
||||
std::vector<size_t>{1, 0, 2, 3},
|
||||
std::vector<size_t>{},
|
||||
};
|
||||
|
||||
std::vector<CPUSpecificParams> cpuParams_4D = {
|
||||
CPUSpecificParams({nChw16c}, {}, {}, {}),
|
||||
CPUSpecificParams({nchw}, {}, {}, {}),
|
||||
};
|
||||
|
||||
const auto params4D = ::testing::Combine(
|
||||
::testing::ValuesIn(inputOrder4D),
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::ValuesIn(inputShapes4D),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config),
|
||||
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_Permute4D_CPU, PermuteLayerCPUTest, params4D, PermuteLayerCPUTest::getTestCaseName);
|
||||
|
||||
const std::vector<std::vector<size_t>> inputShapes5D = {
|
||||
{2, 32, 5, 10, 20}
|
||||
};
|
||||
|
||||
const std::vector<std::vector<size_t>> inputOrder5D = {
|
||||
std::vector<size_t>{0, 1, 2, 3, 4},
|
||||
std::vector<size_t>{0, 4, 2, 3, 1},
|
||||
std::vector<size_t>{0, 4, 2, 1, 3},
|
||||
std::vector<size_t>{0, 2, 4, 3, 1},
|
||||
std::vector<size_t>{0, 3, 2, 4, 1},
|
||||
std::vector<size_t>{0, 3, 1, 4, 2},
|
||||
std::vector<size_t>{1, 0, 2, 3, 4},
|
||||
std::vector<size_t>{},
|
||||
};
|
||||
|
||||
std::vector<CPUSpecificParams> cpuParams_5D = {
|
||||
CPUSpecificParams({nCdhw16c}, {}, {}, {}),
|
||||
CPUSpecificParams({ncdhw}, {}, {}, {}),
|
||||
};
|
||||
|
||||
const auto params5D = ::testing::Combine(
|
||||
::testing::ValuesIn(inputOrder5D),
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::ValuesIn(inputShapes5D),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(additional_config),
|
||||
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_Permute5D_CPU, PermuteLayerCPUTest, params5D, PermuteLayerCPUTest::getTestCaseName);
|
||||
|
||||
} // namespace
|
||||
} // namespace CPULayerTestsDefinitions
|
@ -0,0 +1,352 @@
|
||||
// Copyright (C) 2020 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <single_layer_tests/reduce_ops.hpp>
|
||||
#include "ngraph_functions/builders.hpp"
|
||||
#include "test_utils/cpu_test_utils.hpp"
|
||||
|
||||
using namespace InferenceEngine;
|
||||
using namespace CPUTestUtils;
|
||||
using namespace LayerTestsDefinitions;
|
||||
|
||||
namespace CPULayerTestsDefinitions {
|
||||
|
||||
typedef std::tuple<reduceMeanParams, CPUSpecificParams> ReduceLayerCPUTestParamSet;
|
||||
|
||||
class ReduceCPULayerTest : public testing::WithParamInterface<ReduceLayerCPUTestParamSet>,
|
||||
virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<ReduceLayerCPUTestParamSet> obj) {
|
||||
reduceMeanParams basicParamsSet;
|
||||
CPUSpecificParams cpuParams;
|
||||
std::tie(basicParamsSet, cpuParams) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
result << LayerTestsDefinitions::ReduceOpsLayerTest::getTestCaseName(testing::TestParamInfo<reduceMeanParams>(
|
||||
basicParamsSet, 0));
|
||||
result << CPUTestsBase::getTestCaseName(cpuParams);
|
||||
|
||||
return result.str();
|
||||
}
|
||||
protected:
|
||||
void SetUp() override {
|
||||
reduceMeanParams basicParamsSet;
|
||||
CPUSpecificParams cpuParams;
|
||||
std::tie(basicParamsSet, cpuParams) = this->GetParam();
|
||||
|
||||
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
|
||||
|
||||
InferenceEngine::Precision netPrecision;
|
||||
bool keepDims;
|
||||
std::vector<size_t> inputShape;
|
||||
std::vector<int> axes;
|
||||
CommonTestUtils::OpType opType;
|
||||
std::tie(axes, opType, keepDims, reductionType, netPrecision, inPrc, outPrc, inLayout, inputShape, targetDevice) = basicParamsSet;
|
||||
|
||||
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
|
||||
auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
|
||||
auto paramOuts = ngraph::helpers::convert2OutputVector(
|
||||
ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
|
||||
|
||||
std::vector<size_t> shapeAxes;
|
||||
switch (opType) {
|
||||
case CommonTestUtils::OpType::SCALAR: {
|
||||
if (axes.size() > 1)
|
||||
FAIL() << "In reduce op if op type is scalar, 'axis' input's must contain 1 element";
|
||||
break;
|
||||
}
|
||||
case CommonTestUtils::OpType::VECTOR: {
|
||||
shapeAxes.push_back(axes.size());
|
||||
break;
|
||||
}
|
||||
default:
|
||||
FAIL() << "Reduce op doesn't support operation type: " << opType;
|
||||
}
|
||||
auto reductionAxesNode = std::dynamic_pointer_cast<ngraph::Node>(
|
||||
std::make_shared<ngraph::opset3::Constant>(ngraph::element::Type_t::i64, ngraph::Shape(shapeAxes), axes));
|
||||
|
||||
const auto reduce = ngraph::builder::makeReduce(paramOuts[0], reductionAxesNode, keepDims, reductionType);
|
||||
|
||||
selectedType = getPrimitiveType() + "_" + inPrc.name();
|
||||
|
||||
reduce->get_rt_info() = getCPUInfo();
|
||||
|
||||
const ngraph::ResultVector results{std::make_shared<ngraph::opset3::Result>(reduce)};
|
||||
function = std::make_shared<ngraph::Function>(results, params, "Reduce");
|
||||
}
|
||||
InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo &info) const override {
|
||||
if (ngraph::helpers::ReductionType::Prod == reductionType) {
|
||||
// We change the range of random values to avoid possible floating point overflow
|
||||
auto blob = FuncTestUtils::createAndFillBlob(info.getTensorDesc(), 10, 5);
|
||||
if (Precision::FP32 == info.getTensorDesc().getPrecision()) {
|
||||
auto *rawBlobDataPtr = blob->buffer().as<float *>();
|
||||
for (size_t i = 0; i < blob->size(); ++i) {
|
||||
rawBlobDataPtr[i] /= 10.f;
|
||||
}
|
||||
} else if (Precision::BF16 == info.getTensorDesc().getPrecision()) {
|
||||
auto *rawBlobDataPtr = blob->buffer().as<ngraph::bfloat16 *>();
|
||||
for (size_t i = 0; i < blob->size(); ++i) {
|
||||
rawBlobDataPtr[i] /= 10.f;
|
||||
}
|
||||
}
|
||||
return blob;
|
||||
} else {
|
||||
return LayerTestsCommon::GenerateInput(info);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
ngraph::helpers::ReductionType reductionType;
|
||||
};
|
||||
|
||||
TEST_P(ReduceCPULayerTest, CompareWithRefs) {
|
||||
SKIP_IF_CURRENT_TEST_IS_DISABLED()
|
||||
|
||||
auto ops = function->get_ordered_ops();
|
||||
std::string name = (*(++ops.rbegin()))->get_type_name();
|
||||
|
||||
if ("ReduceLogicalAnd" == name) {
|
||||
name = "ReduceAnd";
|
||||
}
|
||||
if ("ReduceLogicalOr" == name) {
|
||||
name = "ReduceOr";
|
||||
}
|
||||
|
||||
Run();
|
||||
CheckCPUImpl(executableNetwork, name);
|
||||
}
|
||||
namespace {
|
||||
std::vector<Precision> inpOutPrc = {Precision::BF16, Precision::FP32};
|
||||
|
||||
const std::vector<bool> keepDims = {
|
||||
true,
|
||||
false,
|
||||
};
|
||||
|
||||
const std::vector<std::vector<int>> axes = {
|
||||
{0},
|
||||
{1},
|
||||
{2},
|
||||
{3}
|
||||
};
|
||||
|
||||
const std::vector<std::vector<int>> axesND = {
|
||||
{0, 1},
|
||||
{0, 2},
|
||||
{0, 3},
|
||||
{1, 2},
|
||||
{1, 3},
|
||||
{2, 3},
|
||||
{0, 1, 2},
|
||||
{0, 1, 3},
|
||||
{0, 2, 3},
|
||||
{1, 2, 3},
|
||||
{0, 1, 2, 3}
|
||||
};
|
||||
|
||||
std::vector<CommonTestUtils::OpType> opTypes = {
|
||||
CommonTestUtils::OpType::SCALAR,
|
||||
CommonTestUtils::OpType::VECTOR,
|
||||
};
|
||||
|
||||
const std::vector<ngraph::helpers::ReductionType> reductionTypes = {
|
||||
// ngraph::helpers::ReductionType::Mean, //optimized out during the graph transformations
|
||||
// ngraph::helpers::ReductionType::Max, //optimized out during the graph transformations
|
||||
// ngraph::helpers::ReductionType::Sum, //optimized out during the graph transformations
|
||||
ngraph::helpers::ReductionType::Min,
|
||||
ngraph::helpers::ReductionType::Prod,
|
||||
ngraph::helpers::ReductionType::L1,
|
||||
ngraph::helpers::ReductionType::L2,
|
||||
};
|
||||
|
||||
const std::vector<ngraph::helpers::ReductionType> reductionLogicalTypes = {
|
||||
ngraph::helpers::ReductionType::LogicalOr,
|
||||
ngraph::helpers::ReductionType::LogicalAnd
|
||||
};
|
||||
|
||||
const std::vector<std::vector<size_t>> inputShapes = {
|
||||
std::vector<size_t>{10, 5, 15, 12},
|
||||
std::vector<size_t>{3, 5, 7, 9},
|
||||
};
|
||||
|
||||
std::vector<CPUSpecificParams> cpuParams_4D = {
|
||||
CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}),
|
||||
CPUSpecificParams({nchw}, {nchw}, {}, {})
|
||||
};
|
||||
|
||||
std::vector<CPUSpecificParams> cpuParams_5D = {
|
||||
CPUSpecificParams({nCdhw16c}, {nCdhw16c}, {}, {}),
|
||||
CPUSpecificParams({ncdhw}, {ncdhw}, {}, {})
|
||||
};
|
||||
|
||||
const auto paramsOneAxis = ::testing::Combine(
|
||||
testing::Combine(
|
||||
testing::ValuesIn(axes),
|
||||
testing::ValuesIn(opTypes),
|
||||
testing::ValuesIn(keepDims),
|
||||
testing::ValuesIn(reductionTypes),
|
||||
testing::Values(InferenceEngine::Precision::FP32),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::Values(InferenceEngine::Layout::ANY),
|
||||
testing::ValuesIn(inputShapes),
|
||||
testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
testing::Values(emptyCPUSpec));
|
||||
|
||||
const auto paramsOneAxisLogical = testing::Combine(
|
||||
testing::Combine(
|
||||
testing::ValuesIn(axes),
|
||||
testing::ValuesIn(opTypes),
|
||||
testing::ValuesIn(keepDims),
|
||||
testing::ValuesIn(reductionLogicalTypes),
|
||||
testing::Values(InferenceEngine::Precision::BOOL),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::Values(InferenceEngine::Layout::ANY),
|
||||
testing::ValuesIn(inputShapes),
|
||||
testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
testing::Values(emptyCPUSpec));
|
||||
|
||||
const auto params_MultiAxis = testing::Combine(
|
||||
testing::Combine(
|
||||
testing::ValuesIn(axesND),
|
||||
testing::Values(opTypes[1]),
|
||||
testing::Values(false),
|
||||
testing::ValuesIn(reductionTypes),
|
||||
testing::Values(InferenceEngine::Precision::FP32),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::Values(InferenceEngine::Layout::ANY),
|
||||
testing::Values(std::vector<size_t>{2, 9, 2, 9}),
|
||||
testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
testing::Values(emptyCPUSpec));
|
||||
|
||||
const auto params_MultiAxis_4D = testing::Combine(
|
||||
testing::Combine(
|
||||
testing::ValuesIn(axesND),
|
||||
testing::Values(opTypes[1]),
|
||||
testing::Values(true),
|
||||
testing::ValuesIn(reductionTypes),
|
||||
testing::Values(InferenceEngine::Precision::FP32),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::Values(InferenceEngine::Layout::ANY),
|
||||
testing::Values(std::vector<size_t>{2, 19, 2, 9}),
|
||||
testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)));
|
||||
|
||||
const auto params_MultiAxis_5D = testing::Combine(
|
||||
testing::Combine(
|
||||
testing::ValuesIn(axesND),
|
||||
testing::Values(opTypes[1]),
|
||||
testing::Values(true),
|
||||
testing::ValuesIn(reductionTypes),
|
||||
testing::Values(InferenceEngine::Precision::FP32),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::Values(InferenceEngine::Layout::ANY),
|
||||
testing::Values(std::vector<size_t>{2, 19, 7, 2, 9}),
|
||||
testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)));
|
||||
|
||||
const auto params_MultiAxisLogical = testing::Combine(
|
||||
testing::Combine(
|
||||
testing::ValuesIn(axesND),
|
||||
testing::Values(opTypes[1]),
|
||||
testing::Values(false),
|
||||
testing::ValuesIn(reductionLogicalTypes),
|
||||
testing::Values(InferenceEngine::Precision::BOOL),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::Values(InferenceEngine::Layout::ANY),
|
||||
testing::Values(std::vector<size_t>{2, 9, 2, 9}),
|
||||
testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
testing::Values(emptyCPUSpec));
|
||||
|
||||
const auto params_MultiAxisLogical4D = testing::Combine(
|
||||
testing::Combine(
|
||||
testing::ValuesIn(axesND),
|
||||
testing::Values(opTypes[1]),
|
||||
testing::Values(true),
|
||||
testing::ValuesIn(reductionLogicalTypes),
|
||||
testing::Values(InferenceEngine::Precision::BOOL),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::Values(InferenceEngine::Layout::ANY),
|
||||
testing::Values(std::vector<size_t>{2, 19, 2, 9}),
|
||||
testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)));
|
||||
|
||||
const auto params_MultiAxisLogical5D = testing::Combine(
|
||||
testing::Combine(
|
||||
testing::ValuesIn(axesND),
|
||||
testing::Values(opTypes[1]),
|
||||
testing::Values(true),
|
||||
testing::ValuesIn(reductionLogicalTypes),
|
||||
testing::Values(InferenceEngine::Precision::BOOL),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::Values(InferenceEngine::Layout::ANY),
|
||||
testing::Values(std::vector<size_t>{2, 19, 7, 2, 9}),
|
||||
testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
smoke_ReduceOneAxis_CPU,
|
||||
ReduceCPULayerTest,
|
||||
paramsOneAxis,
|
||||
ReduceCPULayerTest::getTestCaseName
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
smoke_ReduceLogicalOneAxis_CPU,
|
||||
ReduceCPULayerTest,
|
||||
paramsOneAxisLogical,
|
||||
ReduceCPULayerTest::getTestCaseName
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
smoke_Reduce_ReductionTypes_CPU,
|
||||
ReduceCPULayerTest,
|
||||
params_MultiAxis,
|
||||
ReduceCPULayerTest::getTestCaseName
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
smoke_Reduce_ReductionTypes4D_CPU,
|
||||
ReduceCPULayerTest,
|
||||
params_MultiAxis_4D,
|
||||
ReduceCPULayerTest::getTestCaseName
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
smoke_Reduce_ReductionTypes5D_CPU,
|
||||
ReduceCPULayerTest,
|
||||
params_MultiAxis_5D,
|
||||
ReduceCPULayerTest::getTestCaseName
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
smoke_ReduceLogical_ReductionTypes_CPU,
|
||||
ReduceCPULayerTest,
|
||||
params_MultiAxisLogical,
|
||||
ReduceCPULayerTest::getTestCaseName
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
smoke_ReduceLogical4D_ReductionTypes_CPU,
|
||||
ReduceCPULayerTest,
|
||||
params_MultiAxisLogical4D,
|
||||
ReduceCPULayerTest::getTestCaseName
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
smoke_ReduceLogical5D_ReductionTypes_CPU,
|
||||
ReduceCPULayerTest,
|
||||
params_MultiAxisLogical5D,
|
||||
ReduceCPULayerTest::getTestCaseName
|
||||
);
|
||||
} // namespace
|
||||
} // namespace CPULayerTestsDefinitions
|
||||
|
@ -0,0 +1,165 @@
|
||||
// Copyright (C) 2020 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <single_layer_tests/region_yolo.hpp>
|
||||
#include "ngraph_functions/builders.hpp"
|
||||
#include "test_utils/cpu_test_utils.hpp"
|
||||
|
||||
using namespace InferenceEngine;
|
||||
using namespace CPUTestUtils;
|
||||
|
||||
namespace CPULayerTestsDefinitions {
|
||||
|
||||
struct regionYoloAttributes {
|
||||
size_t classes;
|
||||
size_t coordinates;
|
||||
size_t num_regions;
|
||||
bool do_softmax;
|
||||
int start_axis;
|
||||
int end_axis;
|
||||
};
|
||||
|
||||
using regionYoloParamsTuple = std::tuple<
|
||||
ngraph::Shape, // Input Shape
|
||||
regionYoloAttributes, // Params
|
||||
std::vector<int64_t>, // mask
|
||||
InferenceEngine::Precision, // Network input precision
|
||||
InferenceEngine::Precision, // Network output precision
|
||||
std::map<std::string, std::string>, // Additional network configuration
|
||||
std::string>; // Device name
|
||||
|
||||
|
||||
class RegionYoloCPULayerTest : public testing::WithParamInterface<regionYoloParamsTuple>,
|
||||
virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<regionYoloParamsTuple> obj) {
|
||||
ngraph::Shape inputShape;
|
||||
regionYoloAttributes attributes;
|
||||
std::vector<int64_t> mask;
|
||||
InferenceEngine::Precision inpPrecision;
|
||||
InferenceEngine::Precision outPrecision;
|
||||
std::string targetName;
|
||||
std::map<std::string, std::string> additionalConfig;
|
||||
|
||||
std::tie(inputShape, attributes, mask, inpPrecision, outPrecision, additionalConfig, targetName) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_";
|
||||
result << "classes=" << attributes.classes << "_";
|
||||
result << "coords=" << attributes.coordinates << "_";
|
||||
result << "num=" << attributes.num_regions << "_";
|
||||
result << "doSoftmax=" << attributes.do_softmax << "_";
|
||||
result << "axis=" << attributes.start_axis << "_";
|
||||
result << "endAxis=" << attributes.end_axis << "_";
|
||||
result << "inpPRC=" << inpPrecision.name() << "_";
|
||||
result << "outPRC=" << outPrecision.name() << "_";
|
||||
result << "targetDevice=" << targetName << "_";
|
||||
return result.str();
|
||||
}
|
||||
protected:
|
||||
void SetUp() override {
|
||||
ngraph::Shape inputShape;
|
||||
regionYoloAttributes attributes;
|
||||
std::vector<int64_t> mask;
|
||||
std::map<std::string, std::string> additionalConfig;
|
||||
|
||||
std::tie(inputShape, attributes, mask, inPrc, outPrc, additionalConfig, targetDevice) = this->GetParam();
|
||||
|
||||
configuration.insert(additionalConfig.begin(), additionalConfig.end());
|
||||
|
||||
selectedType = std::string("unknown_") + inPrc.name();
|
||||
|
||||
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(inPrc);
|
||||
auto param = std::make_shared<ngraph::op::Parameter>(ngPrc, inputShape);
|
||||
auto region_yolo = std::make_shared<ngraph::op::v0::RegionYolo>(param, attributes.coordinates, attributes.classes, attributes.num_regions,
|
||||
attributes.do_softmax, mask, attributes.start_axis, attributes.end_axis);
|
||||
function = std::make_shared<ngraph::Function>(std::make_shared<ngraph::opset1::Result>(region_yolo), ngraph::ParameterVector{param}, "RegionYolo");
|
||||
}
|
||||
};
|
||||
|
||||
TEST_P(RegionYoloCPULayerTest, CompareWithRefs) {
|
||||
SKIP_IF_CURRENT_TEST_IS_DISABLED()
|
||||
|
||||
Run();
|
||||
CheckCPUImpl(executableNetwork, "RegionYolo");
|
||||
}
|
||||
|
||||
namespace {
|
||||
const std::vector<Precision> inpOutPrc = {Precision::BF16, Precision::FP32};
|
||||
|
||||
const std::map<std::string, std::string> additional_config;
|
||||
|
||||
const std::vector<ngraph::Shape> inShapes_caffe = {
|
||||
{1, 125, 13, 13}
|
||||
};
|
||||
|
||||
const std::vector<ngraph::Shape> inShapes_mxnet = {
|
||||
{1, 75, 52, 52},
|
||||
{1, 75, 32, 32},
|
||||
{1, 75, 26, 26},
|
||||
{1, 75, 16, 16},
|
||||
{1, 75, 13, 13},
|
||||
{1, 75, 8, 8}
|
||||
};
|
||||
|
||||
const std::vector<ngraph::Shape> inShapes_v3 = {
|
||||
{1, 255, 52, 52},
|
||||
{1, 255, 26, 26},
|
||||
{1, 255, 13, 13}
|
||||
};
|
||||
|
||||
const std::vector<std::vector<int64_t>> masks = {
|
||||
{0, 1, 2},
|
||||
{3, 4, 5},
|
||||
{6, 7, 8}
|
||||
};
|
||||
|
||||
const std::vector<bool> do_softmax = {true, false};
|
||||
const std::vector<size_t> classes = {80, 20};
|
||||
const std::vector<size_t> num_regions = {5, 9};
|
||||
const size_t coords = 4;
|
||||
const int start_axis = 1;
|
||||
const int end_axis = 3;
|
||||
|
||||
const regionYoloAttributes yoloV3attr = {80, 4, 9, false, 1, 3};
|
||||
|
||||
const auto testCase_yolov3 = ::testing::Combine(
|
||||
::testing::ValuesIn(inShapes_v3),
|
||||
::testing::Values(yoloV3attr),
|
||||
::testing::Values(masks[2]),
|
||||
::testing::ValuesIn(inpOutPrc),
|
||||
::testing::ValuesIn(inpOutPrc),
|
||||
::testing::Values(additional_config),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)
|
||||
);
|
||||
|
||||
const regionYoloAttributes yoloV3mxnetAttr = {20, 4, 9, false, 1, 3};
|
||||
|
||||
const auto testCase_yolov3_mxnet = ::testing::Combine(
|
||||
::testing::ValuesIn(inShapes_mxnet),
|
||||
::testing::Values(yoloV3mxnetAttr),
|
||||
::testing::Values(masks[1]),
|
||||
::testing::ValuesIn(inpOutPrc),
|
||||
::testing::ValuesIn(inpOutPrc),
|
||||
::testing::Values(additional_config),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)
|
||||
);
|
||||
|
||||
const regionYoloAttributes yoloV2caffeAttr = {20, 4, 5, true, 1, 3};
|
||||
|
||||
const auto testCase_yolov2_caffe = ::testing::Combine(
|
||||
::testing::ValuesIn(inShapes_caffe),
|
||||
::testing::Values(yoloV2caffeAttr),
|
||||
::testing::Values(masks[0]),
|
||||
::testing::ValuesIn(inpOutPrc),
|
||||
::testing::ValuesIn(inpOutPrc),
|
||||
::testing::Values(additional_config),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_TestsRegionYolov3CPU, RegionYoloCPULayerTest, testCase_yolov3, RegionYoloCPULayerTest::getTestCaseName);
|
||||
INSTANTIATE_TEST_CASE_P(smoke_TestsRegionYoloMxnetCPU, RegionYoloCPULayerTest, testCase_yolov3_mxnet, RegionYoloCPULayerTest::getTestCaseName);
|
||||
INSTANTIATE_TEST_CASE_P(smoke_TestsRegionYoloCaffeCPU, RegionYoloCPULayerTest, testCase_yolov2_caffe, RegionYoloCPULayerTest::getTestCaseName);
|
||||
} // namespace
|
||||
} // namespace CPULayerTestsDefinitions
|
@ -99,7 +99,7 @@ void ConvConcatSubgraphTest::SetUp() {
|
||||
}
|
||||
}
|
||||
for (size_t conv = 0; conv < convolutionNodes.size(); conv++) {
|
||||
convolutionNodes[conv]->get_rt_info() = setCPUInfo(inFmts, outFmts, priority);
|
||||
convolutionNodes[conv]->get_rt_info() = getCPUInfo();
|
||||
}
|
||||
|
||||
auto concat = ngraph::builder::makeConcat(ngraph::OutputVector{convolutionNodes[0], convolutionNodes[1]}, axis);
|
||||
@ -112,7 +112,7 @@ TEST_P(ConvConcatSubgraphTest, CompareWithRefs) {
|
||||
SKIP_IF_CURRENT_TEST_IS_DISABLED()
|
||||
|
||||
Run();
|
||||
CheckCPUImpl(executableNetwork, pluginTypeNode, inFmts, outFmts, selectedType);
|
||||
CheckCPUImpl(executableNetwork, pluginTypeNode);
|
||||
};
|
||||
|
||||
/* ============= Common Convolution Params ============= */
|
||||
|
@ -84,7 +84,7 @@ void FusePermuteAndReorderTest::CreateGraph() {
|
||||
|
||||
auto constOrder = ngraph::builder::makeConstant(ngraph::element::i64, {inputShape.size()}, order);
|
||||
auto permute = std::make_shared<ngraph::opset5::Transpose>(params[0], constOrder);
|
||||
permute->get_rt_info() = setCPUInfo({memFmt}, {memFmt}, {});
|
||||
permute->get_rt_info() = makeCPUInfo({memFmt}, {memFmt}, {});
|
||||
|
||||
ngraph::ResultVector results{std::make_shared<ngraph::opset5::Result>(permute)};
|
||||
function = std::make_shared<ngraph::Function>(results, params, "PermuteReorder");
|
||||
@ -145,17 +145,17 @@ void FusePermuteAndReorderTest1::CreateGraph() {
|
||||
auto constOrder1 = ngraph::builder::makeConstant(ngraph::element::i64, {inputShape.size()}, order);
|
||||
auto permute1 = std::make_shared<ngraph::opset5::Transpose>(params[0], constOrder1);
|
||||
auto memFmt1 = inputShape.size() == 5 ? ndhwc : nhwc;
|
||||
permute1->get_rt_info() = setCPUInfo({memFmt1}, {memFmt1}, {});
|
||||
permute1->get_rt_info() = makeCPUInfo({memFmt1}, {memFmt1}, {});
|
||||
|
||||
auto constOrder2 = ngraph::builder::makeConstant(ngraph::element::i64, {inputShape.size()}, order);
|
||||
auto permute2 = std::make_shared<ngraph::opset5::Transpose>(permute1, constOrder2);
|
||||
auto memFmt2 = inputShape.size() == 5 ? ndhwc : nhwc;
|
||||
permute2->get_rt_info() = setCPUInfo({memFmt2}, {memFmt2}, {});
|
||||
permute2->get_rt_info() = makeCPUInfo({memFmt2}, {memFmt2}, {});
|
||||
|
||||
auto constOrder3 = ngraph::builder::makeConstant(ngraph::element::i64, {inputShape.size()}, order);
|
||||
auto permute3 = std::make_shared<ngraph::opset5::Transpose>(permute2, constOrder3);
|
||||
auto memFmt3 = inputShape.size() == 5 ? ncdhw : nchw;
|
||||
permute3->get_rt_info() = setCPUInfo({memFmt3}, {memFmt3}, {});
|
||||
permute3->get_rt_info() = makeCPUInfo({memFmt3}, {memFmt3}, {});
|
||||
|
||||
auto shape = ngraph::builder::makeConstant(ngraph::element::i64, {inputShape.size()}, permute3->get_output_shape(0));
|
||||
auto reshape = std::make_shared<ngraph::opset5::Reshape>(permute1, shape, false);
|
||||
@ -214,12 +214,12 @@ void FusePermuteAndReorderTest2::CreateGraph() {
|
||||
auto constOrder1 = ngraph::builder::makeConstant(ngraph::element::i64, {inputShape.size()}, order);
|
||||
auto permute1 = std::make_shared<ngraph::opset5::Transpose>(params[0], constOrder1);
|
||||
auto memFmt1 = inputShape.size() == 5 ? ndhwc : nhwc;
|
||||
permute1->get_rt_info() = setCPUInfo({memFmt1}, {memFmt1}, {});
|
||||
permute1->get_rt_info() = makeCPUInfo({memFmt1}, {memFmt1}, {});
|
||||
|
||||
auto constOrder2 = ngraph::builder::makeConstant(ngraph::element::i64, {inputShape.size()}, order);
|
||||
auto permute2 = std::make_shared<ngraph::opset5::Transpose>(params[1], constOrder2);
|
||||
auto memFmt2 = inputShape.size() == 5 ? ncdhw : nchw;
|
||||
permute2->get_rt_info() = setCPUInfo({memFmt2}, {memFmt2}, {});
|
||||
permute2->get_rt_info() = makeCPUInfo({memFmt2}, {memFmt2}, {});
|
||||
|
||||
auto concat = ngraph::builder::makeConcat({permute1, permute2}, 1);
|
||||
|
||||
|
@ -15,6 +15,7 @@ const char *CPUTestsBase::cpu_fmt2str(cpu_memory_format_t v) {
|
||||
if (v == nCdhw8c) return "nCdhw8c";
|
||||
if (v == nCdhw16c) return "nCdhw16c";
|
||||
if (v == ndhwc) return "ndhwc";
|
||||
if (v == nc) return "nc";
|
||||
if (v == x) return "x";
|
||||
assert(!"unknown fmt");
|
||||
return "undef";
|
||||
@ -34,6 +35,7 @@ cpu_memory_format_t CPUTestsBase::cpu_str2fmt(const char *str) {
|
||||
CASE(nCdhw8c);
|
||||
CASE(nCdhw16c);
|
||||
CASE(ndhwc);
|
||||
CASE(nc);
|
||||
CASE(x);
|
||||
#undef CASE
|
||||
assert(!"unknown memory format");
|
||||
@ -45,7 +47,9 @@ std::string CPUTestsBase::fmts2str(const std::vector<cpu_memory_format_t> &fmts)
|
||||
for (auto &fmt : fmts) {
|
||||
((str += "cpu:") += cpu_fmt2str(fmt)) += ",";
|
||||
}
|
||||
str.erase(str.end() - 1);
|
||||
if (!str.empty()) {
|
||||
str.pop_back();
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
@ -54,14 +58,16 @@ std::string CPUTestsBase::impls2str(const std::vector<std::string> &priority) {
|
||||
for (auto &impl : priority) {
|
||||
((str += "cpu:") += impl) += ",";
|
||||
}
|
||||
str.erase(str.end() - 1);
|
||||
if (!str.empty()) {
|
||||
str.pop_back();
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
void CPUTestsBase::CheckCPUImpl(InferenceEngine::ExecutableNetwork &execNet, std::string nodeType,
|
||||
std::vector<cpu_memory_format_t> inputMemoryFormats,
|
||||
std::vector<cpu_memory_format_t> outputMemoryFormats, std::string selectedType) {
|
||||
void CPUTestsBase::CheckCPUImpl(InferenceEngine::ExecutableNetwork &execNet, std::string nodeType) const {
|
||||
IE_SUPPRESS_DEPRECATED_START
|
||||
ASSERT_TRUE(!selectedType.empty()) << "Node type is not defined.";
|
||||
bool isNodeFound = false;
|
||||
InferenceEngine::CNNNetwork execGraphInfo = execNet.GetExecGraphInfo();
|
||||
auto function = execGraphInfo.getFunction();
|
||||
ASSERT_NE(nullptr, function);
|
||||
@ -84,25 +90,27 @@ void CPUTestsBase::CheckCPUImpl(InferenceEngine::ExecutableNetwork &execNet, std
|
||||
};
|
||||
|
||||
if (getExecValue(ExecGraphInfoSerialization::LAYER_TYPE) == nodeType) {
|
||||
ASSERT_LE(inputMemoryFormats.size(), node->get_input_size());
|
||||
ASSERT_LE(outputMemoryFormats.size(), node->get_output_size());
|
||||
for (int i = 0; i < inputMemoryFormats.size(); i++) {
|
||||
isNodeFound = true;
|
||||
ASSERT_LE(inFmts.size(), node->get_input_size());
|
||||
ASSERT_LE(outFmts.size(), node->get_output_size());
|
||||
for (int i = 0; i < inFmts.size(); i++) {
|
||||
const auto parentPort = node->input_values()[i];
|
||||
const auto port = node->inputs()[i];
|
||||
if ((parentPort.get_tensor_ptr() == port.get_tensor_ptr())) {
|
||||
auto parentNode = parentPort.get_node_shared_ptr();
|
||||
auto actualInputMemoryFormat = getExecValueOutputsLayout(parentNode);
|
||||
ASSERT_EQ(inputMemoryFormats[i], cpu_str2fmt(actualInputMemoryFormat.c_str()));
|
||||
ASSERT_EQ(inFmts[i], cpu_str2fmt(actualInputMemoryFormat.c_str()));
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < outputMemoryFormats.size(); i++) {
|
||||
for (int i = 0; i < outFmts.size(); i++) {
|
||||
auto actualOutputMemoryFormat = getExecValue(ExecGraphInfoSerialization::OUTPUT_LAYOUTS);
|
||||
ASSERT_EQ(outputMemoryFormats[i], cpu_str2fmt(actualOutputMemoryFormat.c_str()));
|
||||
ASSERT_EQ(outFmts[i], cpu_str2fmt(actualOutputMemoryFormat.c_str()));
|
||||
}
|
||||
auto primType = getExecValue(ExecGraphInfoSerialization::IMPL_TYPE);
|
||||
ASSERT_EQ(selectedType, primType);
|
||||
}
|
||||
}
|
||||
ASSERT_TRUE(isNodeFound) << "Node type name: \"" << nodeType << "\" has not been found.";
|
||||
IE_SUPPRESS_DEPRECATED_END
|
||||
}
|
||||
|
||||
@ -112,16 +120,39 @@ std::string CPUTestsBase::getTestCaseName(CPUSpecificParams params) {
|
||||
std::vector<std::string> priority;
|
||||
std::string selectedType;
|
||||
std::tie(inFmts, outFmts, priority, selectedType) = params;
|
||||
result << "_inFmts=" << fmts2str(inFmts);
|
||||
result << "_outFmts=" << fmts2str(outFmts);
|
||||
result << "_primitive=" << selectedType;
|
||||
if (!inFmts.empty()) {
|
||||
result << "_inFmts=" << fmts2str(inFmts);
|
||||
}
|
||||
if (!outFmts.empty()) {
|
||||
result << "_outFmts=" << fmts2str(outFmts);
|
||||
}
|
||||
if (!selectedType.empty()) {
|
||||
result << "_primitive=" << selectedType;
|
||||
}
|
||||
return result.str();
|
||||
}
|
||||
|
||||
std::map<std::string, std::shared_ptr<ngraph::Variant>> CPUTestsBase::setCPUInfo(std::vector<cpu_memory_format_t> inFmts,
|
||||
std::vector<cpu_memory_format_t> outFmts,
|
||||
std::vector<std::string> priority) {
|
||||
std::map<std::string, std::shared_ptr<ngraph::Variant>> cpuInfo;
|
||||
CPUTestsBase::CPUInfo CPUTestsBase::getCPUInfo() const {
|
||||
return makeCPUInfo(inFmts, outFmts, priority);
|
||||
}
|
||||
|
||||
std::string CPUTestsBase::getPrimitiveType() const {
|
||||
std::string isaType;
|
||||
if (InferenceEngine::with_cpu_x86_avx512f()) {
|
||||
isaType = "jit_avx512";
|
||||
} else if (InferenceEngine::with_cpu_x86_avx2()) {
|
||||
isaType = "jit_avx2";
|
||||
} else if (InferenceEngine::with_cpu_x86_sse42()) {
|
||||
isaType = "jit_sse42";
|
||||
} else {
|
||||
isaType = "ref";
|
||||
}
|
||||
return isaType;
|
||||
}
|
||||
|
||||
CPUTestsBase::CPUInfo
|
||||
CPUTestsBase::makeCPUInfo(std::vector<cpu_memory_format_t> inFmts, std::vector<cpu_memory_format_t> outFmts, std::vector<std::string> priority) {
|
||||
CPUInfo cpuInfo;
|
||||
|
||||
if (!inFmts.empty()) {
|
||||
cpuInfo.insert({"InputMemoryFormats", std::make_shared<ngraph::VariantWrapper<std::string>>(fmts2str(inFmts))});
|
||||
@ -136,4 +167,24 @@ std::map<std::string, std::shared_ptr<ngraph::Variant>> CPUTestsBase::setCPUInfo
|
||||
return cpuInfo;
|
||||
}
|
||||
|
||||
std::vector<CPUSpecificParams> filterCPUSpecificParams(std::vector<CPUSpecificParams> ¶msVector) {
|
||||
auto adjustBlockedFormatByIsa = [](std::vector<cpu_memory_format_t>& formats) {
|
||||
for (int i = 0; i < formats.size(); i++) {
|
||||
if (formats[i] == nChw16c)
|
||||
formats[i] = nChw8c;
|
||||
if (formats[i] == nCdhw16c)
|
||||
formats[i] = nCdhw8c;
|
||||
}
|
||||
};
|
||||
|
||||
if (!InferenceEngine::with_cpu_x86_avx512f()) {
|
||||
for (auto& param : paramsVector) {
|
||||
adjustBlockedFormatByIsa(std::get<0>(param));
|
||||
adjustBlockedFormatByIsa(std::get<1>(param));
|
||||
}
|
||||
}
|
||||
|
||||
return paramsVector;
|
||||
}
|
||||
|
||||
} // namespace CPUTestUtils
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Copyright (C) 2020 Intel Corporation
|
||||
// Copyright (C) 2020 Intel Corporation7
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
@ -23,38 +23,44 @@ namespace CPUTestUtils {
|
||||
nCdhw8c,
|
||||
nCdhw16c,
|
||||
ndhwc,
|
||||
nc,
|
||||
x,
|
||||
undef
|
||||
} cpu_memory_format_t;
|
||||
|
||||
using CPUSpecificParams = std::tuple<
|
||||
std::vector<cpu_memory_format_t>,
|
||||
std::vector<cpu_memory_format_t>,
|
||||
std::vector<std::string>,
|
||||
std::string
|
||||
std::vector<cpu_memory_format_t>, //input memomry format
|
||||
std::vector<cpu_memory_format_t>, //output memory format
|
||||
std::vector<std::string>, //priority
|
||||
std::string // selected primitive type
|
||||
>;
|
||||
|
||||
class CPUTestsBase {
|
||||
public:
|
||||
void CheckCPUImpl(InferenceEngine::ExecutableNetwork &execNet, std::string nodeType, std::vector<cpu_memory_format_t> inputMemoryFormats,
|
||||
std::vector<cpu_memory_format_t> outputMemoryFormats, std::string selectedType);
|
||||
|
||||
std::map<std::string, std::shared_ptr<ngraph::Variant>> setCPUInfo(std::vector<cpu_memory_format_t> inFmts, std::vector<cpu_memory_format_t> outFmts,
|
||||
std::vector<std::string> priority);
|
||||
typedef std::map<std::string, std::shared_ptr<ngraph::Variant>> CPUInfo;
|
||||
|
||||
public:
|
||||
static std::string getTestCaseName(CPUSpecificParams params);
|
||||
static const char *cpu_fmt2str(cpu_memory_format_t v);
|
||||
static cpu_memory_format_t cpu_str2fmt(const char *str);
|
||||
static std::string fmts2str(const std::vector<cpu_memory_format_t> &fmts);
|
||||
static std::string impls2str(const std::vector<std::string> &priority);
|
||||
static CPUInfo makeCPUInfo(std::vector<cpu_memory_format_t> inFmts,
|
||||
std::vector<cpu_memory_format_t> outFmts,
|
||||
std::vector<std::string> priority);
|
||||
|
||||
CPUInfo getCPUInfo() const;
|
||||
void CheckCPUImpl(InferenceEngine::ExecutableNetwork &execNet, std::string nodeType) const;
|
||||
|
||||
protected:
|
||||
std::string getPrimitiveType() const;
|
||||
std::vector<cpu_memory_format_t> inFmts, outFmts;
|
||||
std::vector<std::string> priority;
|
||||
std::string selectedType;
|
||||
|
||||
private:
|
||||
static const char *cpu_fmt2str(cpu_memory_format_t v);
|
||||
cpu_memory_format_t cpu_str2fmt(const char *str);
|
||||
static std::string fmts2str(const std::vector<cpu_memory_format_t> &fmts);
|
||||
std::string impls2str(const std::vector<std::string> &priority);
|
||||
};
|
||||
|
||||
const auto emptyCPUSpec = CPUSpecificParams{{}, {}, {}, {}};
|
||||
|
||||
const auto conv_ref_2D = CPUSpecificParams{{nchw}, {nchw}, {"ref_any"}, "ref_any_FP32"};
|
||||
const auto conv_ref_3D = CPUSpecificParams{{ncdhw}, {ncdhw}, {"ref_any"}, "ref_any_FP32"};
|
||||
|
||||
@ -80,4 +86,7 @@ const auto conv_sse42_2D_1x1 = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_sse42
|
||||
const auto conv_avx2_2D_1x1 = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_avx2_1x1"}, "jit_avx2_1x1_FP32"};
|
||||
const auto conv_avx512_2D_1x1 = CPUSpecificParams{{nChw16c}, {nChw16c}, {"jit_avx512_1x1"}, "jit_avx512_1x1_FP32"};
|
||||
|
||||
// utility functions
|
||||
std::vector<CPUSpecificParams> filterCPUSpecificParams(std::vector<CPUSpecificParams>& paramsVector);
|
||||
|
||||
} // namespace CPUTestUtils
|
||||
|
@ -36,5 +36,9 @@ std::vector<std::string> disabledTestPatterns() {
|
||||
// TODO: Issue: 41461
|
||||
R"(.*TopKLayerTest.*k=10.*mode=min.*sort=index.*)",
|
||||
R"(.*TopKLayerTest.*k=5.*sort=(none|index).*)",
|
||||
// TODO: Issue: 43511
|
||||
R"(.*EltwiseLayerTest.*IS=\(1.4.3.2.1.3\).*OpType=(Prod|Sub).*secondaryInputType=CONSTANT_opType=VECTOR_netPRC=(FP16|FP32).*)",
|
||||
R"(.*EltwiseLayerTest.*IS=\(1.4.3.2.1.3\).*OpType=Sum.*secondaryInputType=CONSTANT_opType=VECTOR_netPRC=(FP16|FP32).*)",
|
||||
R"(.*EltwiseLayerTest.*IS=\(1.4.3.2.1.3\).*OpType=Sub.*secondaryInputType=CONSTANT_opType=VECTOR_netPRC=I64.*)",
|
||||
};
|
||||
}
|
||||
|
@ -98,13 +98,11 @@ void EltwiseLayerTest::SetUp() {
|
||||
eltwiseType == ngraph::helpers::EltwiseTypes::FLOOR_MOD ||
|
||||
eltwiseType == ngraph::helpers::EltwiseTypes::MOD) {
|
||||
std::vector<float> data(ngraph::shape_size(shape_input_secondary));
|
||||
data = NGraphFunctions::Utils::generateVector<ngraph::element::Type_t::f32>(ngraph::shape_size(shape_input_secondary));
|
||||
for (float &i : data) {
|
||||
if (i == 0) {
|
||||
i = 1;
|
||||
}
|
||||
}
|
||||
data = NGraphFunctions::Utils::generateVector<ngraph::element::Type_t::f32>(ngraph::shape_size(shape_input_secondary), 10, 2);
|
||||
secondaryInput = ngraph::builder::makeConstant(ngPrc, shape_input_secondary, data);
|
||||
} else if (eltwiseType == ngraph::helpers::EltwiseTypes::POWER && secondaryInputType == ngraph::helpers::InputLayerType::CONSTANT) {
|
||||
// to avoid floating point overflow on some platforms, let's fill the constant with small numbers.
|
||||
secondaryInput = ngraph::builder::makeConstant<float>(ngPrc, shape_input_secondary, {}, true, 3);
|
||||
} else {
|
||||
secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, shape_input_secondary);
|
||||
if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) {
|
||||
|
@ -27,7 +27,7 @@ std::string RegionYoloLayerTest::getTestCaseName(const testing::TestParamInfo<re
|
||||
std::string targetName;
|
||||
std::tie(inputShape, classes, coords, num_regions, do_softmax , mask, start_axis, end_axis, netPrecision, targetName) = obj.param;
|
||||
std::ostringstream result;
|
||||
result << "IS=" << inputShape << "_";
|
||||
result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_";
|
||||
result << "classes=" << classes << "_";
|
||||
result << "coords=" << coords << "_";
|
||||
result << "num=" << num_regions << "_";
|
||||
@ -51,7 +51,7 @@ void RegionYoloLayerTest::SetUp() {
|
||||
InferenceEngine::Precision netPrecision;
|
||||
std::tie(inputShape, classes, coords, num_regions, do_softmax, mask, start_axis, end_axis, netPrecision, targetDevice) = this->GetParam();
|
||||
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
|
||||
auto param = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, inputShape);
|
||||
auto param = std::make_shared<ngraph::op::Parameter>(ngPrc, inputShape);
|
||||
auto region_yolo = std::make_shared<ngraph::op::v0::RegionYolo>(param, coords, classes, num_regions, do_softmax, mask, start_axis, end_axis);
|
||||
function = std::make_shared<ngraph::Function>(std::make_shared<ngraph::opset1::Result>(region_yolo), ngraph::ParameterVector{param}, "RegionYolo");
|
||||
}
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <utility>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <ngraph/type/bfloat16.hpp>
|
||||
#include <ngraph/type/float16.hpp>
|
||||
|
||||
#include <ie_blob.h>
|
||||
@ -177,8 +178,10 @@ void inline fill_data_random_float(InferenceEngine::Blob::Ptr &blob, const uint3
|
||||
for (size_t i = 0; i < blob->size(); i++) {
|
||||
auto value = static_cast<float>(distribution(random));
|
||||
value /= static_cast<float>(k);
|
||||
if (typeid(dataType) == typeid(typename InferenceEngine::PrecisionTrait<InferenceEngine::Precision::FP16>::value_type)) {
|
||||
if (PRC == InferenceEngine::Precision::FP16) {
|
||||
rawBlobDataPtr[i] = ngraph::float16(value).to_bits();
|
||||
} else if (PRC == InferenceEngine::Precision::BF16) {
|
||||
rawBlobDataPtr[i] = ngraph::bfloat16(value).to_bits();
|
||||
} else {
|
||||
rawBlobDataPtr[i] = value;
|
||||
}
|
||||
@ -237,4 +240,27 @@ void inline fill_data_random<InferenceEngine::Precision::FP16>(InferenceEngine::
|
||||
fill_data_random_float<InferenceEngine::Precision::FP16>(blob, range, start_from, k, seed);
|
||||
}
|
||||
|
||||
template<>
|
||||
void inline fill_data_random<InferenceEngine::Precision::BF16>(InferenceEngine::Blob::Ptr &blob,
|
||||
const uint32_t range,
|
||||
int32_t start_from,
|
||||
const int32_t k, const int seed) {
|
||||
fill_data_random_float<InferenceEngine::Precision::BF16>(blob, range, start_from, k, seed);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
typename std::enable_if<std::is_signed<T>::value, T>::type
|
||||
static ie_abs(const T &val) {
|
||||
return std::abs(val);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
typename std::enable_if<std::is_unsigned<T>::value, T>::type
|
||||
static ie_abs(const T &val) {
|
||||
return val;
|
||||
}
|
||||
|
||||
static ngraph::bfloat16 ie_abs(const ngraph::bfloat16& val) {
|
||||
return ngraph::bfloat16::from_bits(val.to_bits() ^ 0x8000);
|
||||
}
|
||||
} // namespace CommonTestUtils
|
||||
|
@ -328,6 +328,16 @@ convertArrayPrecision<InferenceEngine::Precision::FP16, InferenceEngine::Precisi
|
||||
InferenceEngine::PrecisionUtils::f16tof32Arrays(dst, src, nelem, 1.0f, 0.0f);
|
||||
}
|
||||
|
||||
template<>
|
||||
void inline
|
||||
convertArrayPrecision<InferenceEngine::Precision::BF16, InferenceEngine::Precision::FP32>(float *dst, const short *src,
|
||||
size_t nelem) {
|
||||
auto srcBf16 = reinterpret_cast<const ngraph::bfloat16*>(src);
|
||||
for (size_t i = 0; i < nelem; i++) {
|
||||
dst[i] = static_cast<float>(srcBf16[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<InferenceEngine::Precision::ePrecision PREC_FROM, InferenceEngine::Precision::ePrecision PREC_TO>
|
||||
InferenceEngine::Blob::Ptr inline convertBlobPrecision(const InferenceEngine::Blob::Ptr &blob) {
|
||||
using from_d_type = typename InferenceEngine::PrecisionTrait<PREC_FROM>::value_type;
|
||||
@ -464,6 +474,7 @@ InferenceEngine::Blob::Ptr inline createAndFillBlob(const InferenceEngine::Tenso
|
||||
#define CASE(X) case X: CommonTestUtils::fill_data_random<X>(blob, range, start_from, resolution, seed); break;
|
||||
CASE(InferenceEngine::Precision::FP32)
|
||||
CASE(InferenceEngine::Precision::FP16)
|
||||
CASE(InferenceEngine::Precision::BF16)
|
||||
CASE(InferenceEngine::Precision::U8)
|
||||
CASE(InferenceEngine::Precision::U16)
|
||||
CASE(InferenceEngine::Precision::I8)
|
||||
|
@ -239,6 +239,10 @@ void LayerTestsCommon::Compare(const std::vector<std::uint8_t> &expected, const
|
||||
Compare<uint64_t>(reinterpret_cast<const uint64_t *>(expectedBuffer),
|
||||
reinterpret_cast<const uint64_t *>(actualBuffer), size, 0);
|
||||
break;
|
||||
case InferenceEngine::Precision::BF16:
|
||||
Compare(reinterpret_cast<const ngraph::bfloat16 *>(expectedBuffer),
|
||||
reinterpret_cast<const ngraph::bfloat16 *>(actualBuffer), size, ngraph::bfloat16(threshold));
|
||||
break;
|
||||
default:
|
||||
FAIL() << "Comparator for " << precision << " precision isn't supported";
|
||||
}
|
||||
@ -320,6 +324,9 @@ std::vector<std::vector<std::uint8_t>> LayerTestsCommon::CalculateRefs() {
|
||||
// IE converts f16 to f32
|
||||
ngraph::pass::ConvertPrecision<ngraph::element::Type_t::f16, ngraph::element::Type_t::f32>().run_on_function(
|
||||
function);
|
||||
|
||||
// The same idea for bf16
|
||||
ngraph::pass::ConvertPrecision<ngraph::element::Type_t::bf16, ngraph::element::Type_t::f32>().run_on_function(function);
|
||||
function->validate_nodes_and_infer_types();
|
||||
auto referenceInputs = std::vector<std::vector<std::uint8_t>>(inputs.size());
|
||||
for (std::size_t i = 0; i < inputs.size(); ++i) {
|
||||
@ -347,14 +354,15 @@ std::vector<std::vector<std::uint8_t>> LayerTestsCommon::CalculateRefs() {
|
||||
}
|
||||
}
|
||||
|
||||
const auto& inType = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(inPrc);
|
||||
std::vector<std::vector<std::uint8_t>> expectedOutputs;
|
||||
switch (refMode) {
|
||||
case INTERPRETER: {
|
||||
expectedOutputs = ngraph::helpers::interpreterFunction(function, referenceInputs, convertType);
|
||||
expectedOutputs = ngraph::helpers::interpreterFunction(function, referenceInputs, inType, convertType);
|
||||
break;
|
||||
}
|
||||
case CONSTANT_FOLDING: {
|
||||
const auto &foldedFunc = ngraph::helpers::foldFunction(function, referenceInputs);
|
||||
const auto &foldedFunc = ngraph::helpers::foldFunction(function, referenceInputs, inType);
|
||||
expectedOutputs = ngraph::helpers::getConstData(foldedFunc, convertType);
|
||||
break;
|
||||
}
|
||||
@ -370,7 +378,7 @@ std::vector<std::vector<std::uint8_t>> LayerTestsCommon::CalculateRefs() {
|
||||
m.register_pass<ngraph::pass::ConvertSpaceToBatch>();
|
||||
m.register_pass<ngraph::pass::ConvertBatchToSpace>();
|
||||
m.run_passes(cloned_function);
|
||||
expectedOutputs = ngraph::helpers::interpreterFunction(cloned_function, referenceInputs, convertType);
|
||||
expectedOutputs = ngraph::helpers::interpreterFunction(cloned_function, referenceInputs, inType, convertType);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -15,6 +15,7 @@
|
||||
#include <ie_plugin_config.hpp>
|
||||
#include <ngraph/function.hpp>
|
||||
#include <ngraph/pass/manager.hpp>
|
||||
#include <ngraph/type/bfloat16.hpp>
|
||||
|
||||
#include "common_test_utils/common_utils.hpp"
|
||||
#include "common_test_utils/test_common.hpp"
|
||||
@ -154,29 +155,17 @@ public:
|
||||
protected:
|
||||
LayerTestsCommon();
|
||||
|
||||
template<typename T>
|
||||
typename std::enable_if<std::is_signed<T>::value, T>::type
|
||||
static ie_abs(const T &val) {
|
||||
return std::abs(val);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
typename std::enable_if<std::is_unsigned<T>::value, T>::type
|
||||
static ie_abs(const T &val) {
|
||||
return val;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
static void Compare(const T *expected, const T *actual, std::size_t size, T threshold) {
|
||||
for (std::size_t i = 0; i < size; ++i) {
|
||||
const auto &ref = expected[i];
|
||||
const auto &res = actual[i];
|
||||
const auto absoluteDifference = ie_abs(res - ref);
|
||||
const auto absoluteDifference = CommonTestUtils::ie_abs(res - ref);
|
||||
if (absoluteDifference <= threshold) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const auto max = std::max(ie_abs(res), ie_abs(ref));
|
||||
const auto max = std::max(CommonTestUtils::ie_abs(res), CommonTestUtils::ie_abs(ref));
|
||||
ASSERT_TRUE(max != 0 && ((absoluteDifference / max) <= threshold))
|
||||
<< "Relative comparison of values expected: " << ref << " and actual: " << res
|
||||
<< " at index " << i << " with threshold " << threshold
|
||||
|
@ -233,6 +233,7 @@ inline ngraph::NodeVector castOps2Nodes(const std::vector<std::shared_ptr<opType
|
||||
|
||||
std::vector<std::vector<std::uint8_t>> interpreterFunction(const std::shared_ptr<Function> &function,
|
||||
const std::vector<std::vector<std::uint8_t>> &inputs,
|
||||
element::Type_t inType = element::Type_t::undefined,
|
||||
const std::vector<ngraph::element::Type_t> convertType = {});
|
||||
|
||||
//
|
||||
@ -245,7 +246,8 @@ void CompareFunctions(const Function &actual, const Function &expected);
|
||||
|
||||
|
||||
std::shared_ptr<Function> foldFunction(const std::shared_ptr<Function> &function,
|
||||
const std::vector<std::vector<std::uint8_t>> &inputs);
|
||||
const std::vector<std::vector<std::uint8_t>> &inputs,
|
||||
element::Type_t inpType = element::Type_t::undefined);
|
||||
|
||||
std::vector<std::vector<std::uint8_t>> getConstData(const std::shared_ptr<Function> &function,
|
||||
std::vector<ngraph::element::Type_t> convertType = {});
|
||||
@ -253,7 +255,7 @@ std::vector<std::vector<std::uint8_t>> getConstData(const std::shared_ptr<Functi
|
||||
std::shared_ptr<ngraph::Node> getNodeSharedPtr(const ngraph::NodeTypeInfo &type_info,
|
||||
const ngraph::OutputVector &outputVector);
|
||||
|
||||
std::vector<std::uint8_t> convertOutputPrecision(std::vector<std::uint8_t> &output,
|
||||
std::vector<std::uint8_t> convertOutputPrecision(const std::vector<std::uint8_t> &output,
|
||||
const element::Type_t &fromPrecision,
|
||||
const element::Type_t &toPrecision,
|
||||
const size_t elementsCount);
|
||||
|
@ -17,8 +17,7 @@ std::shared_ptr<ngraph::Node> makeInputLayer(const element::Type &type, ngraph::
|
||||
std::shared_ptr<ngraph::Node> input;
|
||||
switch (inputType) {
|
||||
case ngraph::helpers::InputLayerType::CONSTANT: {
|
||||
std::vector<float> data(ngraph::shape_size(shape));
|
||||
input = ngraph::builder::makeConstant(type, shape, data);
|
||||
input = ngraph::builder::makeConstant<float>(type, shape, {}, true);
|
||||
break;
|
||||
}
|
||||
case ngraph::helpers::InputLayerType::PARAMETER:
|
||||
|
@ -79,7 +79,9 @@ OutputVector convert2OutputVector(const std::vector<std::shared_ptr<Node>> &node
|
||||
return outs;
|
||||
}
|
||||
|
||||
std::vector<std::vector<std::uint8_t>> interpreterFunction(const std::shared_ptr<Function> &function, const std::vector<std::vector<std::uint8_t>> &inputs,
|
||||
std::vector<std::vector<std::uint8_t>> interpreterFunction(const std::shared_ptr<Function> &function,
|
||||
const std::vector<std::vector<std::uint8_t>> &inputs,
|
||||
element::Type_t inType,
|
||||
const std::vector<ngraph::element::Type_t> convertType) {
|
||||
runtime::Backend::set_backend_shared_library_search_directory("");
|
||||
auto backend = runtime::Backend::create("INTERPRETER");
|
||||
@ -98,7 +100,12 @@ std::vector<std::vector<std::uint8_t>> interpreterFunction(const std::shared_ptr
|
||||
const auto ¶meterType = parameter->get_element_type();
|
||||
const auto ¶meterSize = shape_size(parameterShape) * parameterType.size();
|
||||
|
||||
const auto &input = inputs[parameterIndex];
|
||||
auto input = inputs[parameterIndex];
|
||||
|
||||
if (inType != element::undefined && inType != parameterType) {
|
||||
input = convertOutputPrecision(input, inType, parameter->get_element_type(), shape_size(parameter->get_shape()));
|
||||
}
|
||||
|
||||
const auto &inputSize = input.size();
|
||||
NGRAPH_CHECK(parameterSize == inputSize,
|
||||
"Got parameter (", parameter->get_friendly_name(), ") of size ", parameterSize,
|
||||
@ -137,21 +144,30 @@ std::vector<std::vector<std::uint8_t>> interpreterFunction(const std::shared_ptr
|
||||
}
|
||||
|
||||
std::shared_ptr<Function> foldFunction(const std::shared_ptr<Function> &function,
|
||||
const std::vector<std::vector<std::uint8_t>> &inputs) {
|
||||
const std::vector<std::vector<std::uint8_t>> &inputs, element::Type_t inpType) {
|
||||
std::vector<element::Type> paramElementTypes;
|
||||
std::vector<PartialShape> paramShapes;
|
||||
std::vector<std::vector<std::uint8_t>> vecTmpConvertedInputs;
|
||||
vecTmpConvertedInputs.reserve(inputs.size());
|
||||
|
||||
std::vector<void *> inBuffers;
|
||||
inBuffers.reserve(inputs.size());
|
||||
|
||||
for (const auto ¶m : function->get_parameters()) {
|
||||
paramElementTypes.emplace_back(param->get_element_type());
|
||||
paramShapes.emplace_back(param->get_shape());
|
||||
}
|
||||
auto parameterIndex = function->get_parameter_index(param);
|
||||
auto& input = inputs[parameterIndex];
|
||||
|
||||
auto inBuffers = std::vector<void *>(inputs.size());
|
||||
std::transform(inputs.cbegin(), inputs.cend(), inBuffers.begin(),
|
||||
[](const std::vector<std::uint8_t> &input) {
|
||||
// const_cast added to satisfy specialize_function interface
|
||||
// which requires inputs as std::vector<void *>
|
||||
return const_cast<std::uint8_t *>(input.data());
|
||||
});
|
||||
if (inpType != element::undefined && inpType != paramElementTypes.back()) {
|
||||
vecTmpConvertedInputs.emplace_back(convertOutputPrecision(input, inpType, param->get_element_type(), shape_size(param->get_shape())));
|
||||
inBuffers.push_back(vecTmpConvertedInputs.back().data());
|
||||
} else {
|
||||
// const_cast added to satisfy specialize_function interface
|
||||
// which requires inputs as std::vector<void *>
|
||||
inBuffers.push_back(const_cast<std::uint8_t *>(input.data()));
|
||||
}
|
||||
}
|
||||
|
||||
const auto &foldedFunc = specialize_function(function, paramElementTypes, paramShapes, inBuffers);
|
||||
ngraph::pass::ConstantFolding().run_on_function(foldedFunc);
|
||||
@ -250,7 +266,7 @@ std::shared_ptr<ngraph::Node> getNodeSharedPtr(const ngraph::NodeTypeInfo &type_
|
||||
}
|
||||
|
||||
template <typename fromPrec, typename toPrec>
|
||||
std::vector<std::uint8_t> convertPrecision(std::vector<std::uint8_t> &buffer, const size_t elementsCount, const size_t elementSize) {
|
||||
std::vector<std::uint8_t> convertPrecision(const std::vector<std::uint8_t> &buffer, const size_t elementsCount, const size_t elementSize) {
|
||||
std::vector<std::uint8_t> convertedData(elementsCount * elementSize);
|
||||
const fromPrec *src = reinterpret_cast<const fromPrec *>(buffer.data());
|
||||
toPrec *dst = reinterpret_cast<toPrec *>(convertedData.data());
|
||||
@ -270,8 +286,10 @@ bool is_tensor_iterator_exist(const std::shared_ptr<ngraph::Function> & func) {
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<std::uint8_t> convertOutputPrecision(std::vector<std::uint8_t> &output, const element::Type_t &fromPrecision, const element::Type_t &toPrecision,
|
||||
const size_t elementsCount) {
|
||||
std::vector<std::uint8_t> convertOutputPrecision(const std::vector<std::uint8_t> &output,
|
||||
const element::Type_t &fromPrecision,
|
||||
const element::Type_t &toPrecision,
|
||||
const size_t elementsCount) {
|
||||
switch (fromPrecision) {
|
||||
case element::Type_t::u8: {
|
||||
switch (toPrecision) {
|
||||
@ -520,6 +538,12 @@ std::vector<std::uint8_t> convertOutputPrecision(std::vector<std::uint8_t> &outp
|
||||
case element::Type_t::u64: {
|
||||
return convertPrecision<float, uint64_t>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
case element::Type_t::bf16: {
|
||||
return convertPrecision<float, ngraph::bfloat16>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
case element::Type_t::boolean: {
|
||||
return convertPrecision<float, char>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
default:
|
||||
throw std::runtime_error("convertOutputPrecision can't convert from: " + element::Type(fromPrecision).get_type_name() + " to: " +
|
||||
element::Type(toPrecision).get_type_name());
|
||||
@ -548,6 +572,9 @@ std::vector<std::uint8_t> convertOutputPrecision(std::vector<std::uint8_t> &outp
|
||||
case element::Type_t::f32: {
|
||||
return convertPrecision<char, float>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
case element::Type_t::bf16: {
|
||||
return convertPrecision<char, ngraph::bfloat16>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
case element::Type_t::u64: {
|
||||
return convertPrecision<char, uint64_t>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
@ -556,6 +583,43 @@ std::vector<std::uint8_t> convertOutputPrecision(std::vector<std::uint8_t> &outp
|
||||
element::Type(toPrecision).get_type_name());
|
||||
}
|
||||
}
|
||||
case element::Type_t::bf16: {
|
||||
switch (toPrecision) {
|
||||
case element::Type_t::u8: {
|
||||
return convertPrecision<ngraph::bfloat16, uint8_t>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
case element::Type_t::u16: {
|
||||
return convertPrecision<ngraph::bfloat16, uint16_t>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
case element::Type_t::i8: {
|
||||
return convertPrecision<ngraph::bfloat16, int8_t>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
case element::Type_t::i16: {
|
||||
return convertPrecision<ngraph::bfloat16, int16_t>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
case element::Type_t::i32: {
|
||||
return convertPrecision<ngraph::bfloat16, int32_t>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
case element::Type_t::i64: {
|
||||
return convertPrecision<ngraph::bfloat16, int64_t>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
case element::Type_t::f32: {
|
||||
return convertPrecision<ngraph::bfloat16, float>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
case element::Type_t::u64: {
|
||||
return convertPrecision<ngraph::bfloat16, uint64_t>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
case element::Type_t::bf16: {
|
||||
return convertPrecision<ngraph::bfloat16, ngraph::bfloat16>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
case element::Type_t::boolean: {
|
||||
return convertPrecision<ngraph::bfloat16, char>(output, elementsCount, element::Type(toPrecision).size());
|
||||
}
|
||||
default:
|
||||
throw std::runtime_error("convertOutputPrecision can't convert from: " + element::Type(fromPrecision).get_type_name() + " to: " +
|
||||
element::Type(toPrecision).get_type_name());
|
||||
}
|
||||
}
|
||||
default:
|
||||
throw std::runtime_error("convertOutputPrecision can't convert from: " + element::Type(fromPrecision).get_type_name() + " precision");
|
||||
}
|
||||
|
@ -1,247 +0,0 @@
|
||||
// Copyright (C) 2018-2020 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <ie_core.hpp>
|
||||
|
||||
#include "tests_common.hpp"
|
||||
#include "single_layer_common.hpp"
|
||||
#include "ir_gen_helper.hpp"
|
||||
|
||||
using namespace ::testing;
|
||||
using namespace InferenceEngine;
|
||||
using namespace single_layer_tests;
|
||||
|
||||
// Shape description for one Crop-layer test case.
// All three vectors are ordered N, C, (D,) H, W and must have the same rank
// (4 for NCHW cases, 5 for NCDHW cases).
struct crop_base_params {
std::vector<size_t> in_dims;   // input tensor dimensions
std::vector<size_t> out_dims;  // cropped (output) tensor dimensions
std::vector<size_t> offsets;   // crop start offset along each axis
};
|
||||
|
||||
#ifdef IN
|
||||
#undef IN
|
||||
#endif
|
||||
|
||||
// Full parameter set for one test run: the shapes plus the target device.
struct crop_test_params : crop_base_params {
std::string device_name;  // plugin the network is loaded on (e.g. "CPU")

crop_test_params(std::string name, crop_base_params params) :
crop_base_params(params), device_name(name) {}
};
|
||||
|
||||
template <typename data_t>
|
||||
void ref_crop(InferenceEngine::TBlob<data_t> &src, InferenceEngine::TBlob<data_t> &dst, crop_test_params prm) {
|
||||
data_t *dst_ptr = dst.data();
|
||||
|
||||
int ndims = prm.in_dims.size();
|
||||
|
||||
size_t OFFSET_N = prm.offsets.at(0);
|
||||
size_t OFFSET_C = prm.offsets.at(1);
|
||||
size_t OFFSET_D = ndims == 5 ? prm.offsets.at(ndims - 3) : 0;
|
||||
size_t OFFSET_H = prm.offsets.at(ndims - 2);
|
||||
size_t OFFSET_W = prm.offsets.at(ndims - 1);
|
||||
|
||||
size_t ON = prm.out_dims[0];
|
||||
size_t OC = prm.out_dims[1];
|
||||
size_t OD = ndims == 5 ? prm.out_dims[ndims - 3] : 1;
|
||||
size_t OH = prm.out_dims[ndims - 2];
|
||||
size_t OW = prm.out_dims[ndims - 1];
|
||||
|
||||
size_t IN = prm.in_dims[0];
|
||||
size_t IC = prm.in_dims[1];
|
||||
size_t ID = ndims == 5 ? prm.in_dims[ndims - 3] : 1;
|
||||
size_t IH = prm.in_dims[ndims - 2];
|
||||
size_t IW = prm.in_dims[ndims - 1];
|
||||
|
||||
auto dst_off = [=](size_t n, size_t c, size_t d, size_t h, size_t w) -> size_t {
|
||||
return (n * OC * OD * OH * OW + c * OD * OH * OW + d * OH * OW + h * OW + w);
|
||||
};
|
||||
auto src_off = [=](size_t n, size_t c, size_t d, size_t h, size_t w) -> size_t {
|
||||
return (n * IC * ID * IH * IW + c * ID * IH * IW + d * IH * IW + h * IW + w);
|
||||
};
|
||||
|
||||
ASSERT_GE(IN - OFFSET_N, ON);
|
||||
ASSERT_GE(IC - OFFSET_C, OC);
|
||||
ASSERT_GE(ID - OFFSET_D, OD);
|
||||
ASSERT_GE(IH - OFFSET_H, OH);
|
||||
ASSERT_GE(IW - OFFSET_W, OW);
|
||||
|
||||
data_t* src_ptr = src.data();
|
||||
for (size_t n = 0; n < ON; ++n) {
|
||||
for (size_t c = 0; c < OC; ++c) {
|
||||
for (size_t d = 0; d < OD; ++d) {
|
||||
for (size_t h = 0; h < OH; ++h) {
|
||||
for (size_t w = 0; w < OW; ++w) {
|
||||
dst_ptr[dst_off(n, c, d, h, w)] = src_ptr[src_off(n + OFFSET_N, c + OFFSET_C, d + OFFSET_D,
|
||||
h + OFFSET_H, w + OFFSET_W)];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// CPU smoke test for the Crop layer: generates a single-layer IR from the
// template below, runs inference on the target device and compares the
// result against the ref_crop reference implementation.
class smoke_CropOnlyTest: public TestsCommon,
public WithParamInterface<crop_test_params> {
// IR snippet for a single Crop layer. The _IDx_/_ODx_/_OFx_ placeholders
// are substituted in getModel() with input dims / output dims / offsets;
// the axis-4 entries are removed for 4D cases.
std::string layers_t = R"V0G0N(
        <layer name="crop" id="1" type="Crop" precision="FP32">
            <crop-data>
                <crop axis="0" offset="_OF0_" dim="_OD0_" />
                <crop axis="1" offset="_OF1_" dim="_OD1_" />
                <crop axis="2" offset="_OF2_" dim="_OD2_" />
                <crop axis="3" offset="_OF3_" dim="_OD3_" />
                <crop axis="4" offset="_OF4_" dim="_OD4_" />
            </crop-data>
            <input>
                <port id="0">
                    <dim>_ID0_</dim>
                    <dim>_ID1_</dim>
                    <dim>_ID2_</dim>
                    <dim>_ID3_</dim>
                    <dim>_ID4_</dim>
                </port>
            </input>
            <output>
                <port id="1">
                    <dim>_OD0_</dim>
                    <dim>_OD1_</dim>
                    <dim>_OD2_</dim>
                    <dim>_OD3_</dim>
                    <dim>_OD4_</dim>
                </port>
            </output>
        </layer>
)V0G0N";

// Single edge connecting the generated input layer (id 0) to the crop layer.
std::string edges_t = R"V0G0N(
        <edge from-layer="0" from-port="0" to-layer="1" to-port="0"/>
)V0G0N";

// Builds the final IR string: drops the 5th-axis placeholders for 4D
// shapes, then substitutes all dim/offset placeholders with the concrete
// values from p.
std::string getModel(crop_test_params p) {
std::string model = layers_t;

auto dims_size = p.in_dims.size();

if (dims_size == 4) {
REMOVE_LINE(model, "<crop axis=\"4\" offset=\"_OF4_\" dim=\"_OD4_\" />");
REMOVE_LINE(model, "<dim>_ID4_</dim>");
REMOVE_LINE(model, "<dim>_OD4_</dim>");
}

REPLACE_WITH_NUM(model, "_ID0_", p.in_dims[0]);
REPLACE_WITH_NUM(model, "_ID1_", p.in_dims[1]);
REPLACE_WITH_NUM(model, "_ID2_", p.in_dims[2]);
REPLACE_WITH_NUM(model, "_ID3_", p.in_dims[3]);
if (dims_size == 5)
REPLACE_WITH_NUM(model, "_ID4_", p.in_dims[4]);

REPLACE_WITH_NUM(model, "_OD0_", p.out_dims[0]);
REPLACE_WITH_NUM(model, "_OD1_", p.out_dims[1]);
REPLACE_WITH_NUM(model, "_OD2_", p.out_dims[2]);
REPLACE_WITH_NUM(model, "_OD3_", p.out_dims[3]);
if (dims_size == 5)
REPLACE_WITH_NUM(model, "_OD4_", p.out_dims[4]);

REPLACE_WITH_NUM(model, "_OF0_", p.offsets[0]);
REPLACE_WITH_NUM(model, "_OF1_", p.offsets[1]);
REPLACE_WITH_NUM(model, "_OF2_", p.offsets[2]);
REPLACE_WITH_NUM(model, "_OF3_", p.offsets[3]);
if (dims_size == 5)
REPLACE_WITH_NUM(model, "_OF4_", p.offsets[4]);

// Wrap the crop layer into a complete network IR with a generated input.
model = IRTemplateGenerator::getIRTemplate("Crop_Only", p.in_dims, "FP32", model, edges_t);

return model;
}

protected:
// Entire test runs here; TEST_P's body is empty.
virtual void SetUp() {
try {
crop_test_params p = ::testing::WithParamInterface<crop_test_params>::GetParam();
std::string model = getModel(p);

Core ie;
CNNNetwork network = ie.ReadNetwork(model, Blob::CPtr());

// NOTE(review): layout is computed here but never applied to the
// input/output info below — looks like dead code; confirm intent.
InferenceEngine::Layout layout = InferenceEngine::ANY;
switch (p.in_dims.size()) {
case 4: layout = InferenceEngine::NCHW; break;
case 5: layout = InferenceEngine::NCDHW; break;
}

InputsDataMap inputs = network.getInputsInfo();
DataPtr inPtr1 = inputs["in1"]->getInputData();

// Input blob filled with deterministic test data.
InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float>(inPtr1->getTensorDesc());
src->allocate();
fill_data(src->buffer(), src->size());

TBlob<float>* srcPtr = dynamic_cast<TBlob<float>*>(src.get());
BlobMap srcs;
srcs.insert(std::pair<std::string, Blob::Ptr>("in1", src));

// Output blob produced by the plugin.
OutputsDataMap out = network.getOutputsInfo();
BlobMap dstBlobs;
std::pair<std::string, DataPtr> item = *out.begin();
TBlob<float>::Ptr dst;
dst = make_shared_blob<float>(item.second->getTensorDesc());
dst->allocate();
dstBlobs[item.first] = dst;

// Reference output computed on the host.
TBlob<float>::Ptr dst_ref;
dst_ref = make_shared_blob<float>(item.second->getTensorDesc());
dst_ref->allocate();

ref_crop(*srcPtr, *dst_ref, p);

// Run the network on the requested device and compare with the reference.
ExecutableNetwork exeNetwork = ie.LoadNetwork(network, p.device_name);
InferRequest inferRequest = exeNetwork.CreateInferRequest();
inferRequest.SetInput(srcs);
inferRequest.SetOutput(dstBlobs);
inferRequest.Infer();

compare(*dstBlobs.begin()->second, *dst_ref);

} catch (const details::InferenceEngineException &e) {
FAIL() << e.what();
}
}
};
|
||||
|
||||
// Crop test shapes: each case is {input dims, output dims, crop offsets},
// axes ordered N, C, (D,) H, W. Cases 1-4 are 4D, cases 5-6 are 5D.
#define case_1 crop_base_params({{1, 5, 32, 32}, {1, 2, 23, 23}, {0, 2, 5, 4}})
#define case_2 crop_base_params({{1, 5, 32, 32}, {1, 5, 5, 5}, {0, 0, 20, 20}})
#define case_3 crop_base_params({{1, 5, 32, 32}, {1, 5, 32, 10}, {0, 0, 0, 20}})
#define case_4 crop_base_params({{1, 5, 32, 20}, {1, 5, 30, 10}, {0, 0, 2, 10}})
#define case_5 crop_base_params({{1, 5, 32, 20, 14}, {1, 5, 30, 10, 8}, {0, 0, 2, 10, 6}})
#define case_6 crop_base_params({{5, 9, 32, 20, 14}, {2, 5, 30, 10, 8}, {3, 4, 2, 10, 6}})

// Body intentionally empty: the whole test runs in the fixture's SetUp().
TEST_P(smoke_CropOnlyTest, TestsCrop) {}
|
||||
|
||||
std::string getTestCaseName(testing::TestParamInfo<crop_test_params> obj) {
|
||||
int ndims = obj.param.in_dims.size();
|
||||
|
||||
return obj.param.device_name +
|
||||
"_in" + std::to_string(obj.param.in_dims[0]) +
|
||||
"_ic" + std::to_string(obj.param.in_dims[1]) +
|
||||
"_id" + std::to_string(ndims == 5 ? obj.param.in_dims[ndims - 3] : 1) +
|
||||
"_ih" + std::to_string(obj.param.in_dims[ndims - 2]) +
|
||||
"_iw" + std::to_string(obj.param.in_dims[ndims - 1]) +
|
||||
"_on" + std::to_string(obj.param.out_dims[0]) +
|
||||
"_oc" + std::to_string(obj.param.out_dims[1]) +
|
||||
"_od" + std::to_string(ndims == 5 ? obj.param.out_dims[ndims - 3] : 1) +
|
||||
"_oh" + std::to_string(obj.param.out_dims[ndims - 2]) +
|
||||
"_ow" + std::to_string(obj.param.out_dims[ndims - 1]);
|
||||
}
|
||||
|
||||
// Every shape case is exercised on the CPU plugin.
crop_test_params crop_only_test_cases[] = {
crop_test_params("CPU", case_1),
crop_test_params("CPU", case_2),
crop_test_params("CPU", case_3),
crop_test_params("CPU", case_4),
crop_test_params("CPU", case_5),
crop_test_params("CPU", case_6),
};
|
||||
|
||||
// Instantiate the parameterized Crop suite over all cases.
// Fix: the instantiation was named "TestsPooling" — an apparent copy-paste
// from the pooling tests; renamed so reported test names match the suite.
INSTANTIATE_TEST_CASE_P(
        TestsCrop, smoke_CropOnlyTest, ::testing::ValuesIn(crop_only_test_cases), getTestCaseName);
|
@ -71,6 +71,8 @@ namespace absop
|
||||
break;
|
||||
TYPE_CASE(f32)(arg0, out, count);
|
||||
break;
|
||||
TYPE_CASE(bf16)(arg0, out, count);
|
||||
break;
|
||||
default: rc = false; break;
|
||||
}
|
||||
return rc;
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user