Compare commits

...

7 Commits

Author SHA1 Message Date
Maxim Shevtsov
834755680d fp32 outputs fixup to properly handle negative values (#2529) 2020-10-05 13:51:41 +03:00
Maxim Shevtsov
c8b783f644 Pre.2021.1.submission (#2094)
* fixed code and updated unit tests to accommodate auto-reshaping graphs, to unlock full validation

* [CPU][BF16] bf16 for Gemm or MatMul was enabled (#1920)

# Conflicts:
#	inference-engine/thirdparty/mkl-dnn

* Fuse EmbeddingBag

* [IE CLDNN] Fix result storing in leftover's branch (#2050)

Co-authored-by: Alexey Varyzgin <alexey.varyzgin@intel.com>
Co-authored-by: Vafin, Maxim <maxim.vafin@intel.com>
Co-authored-by: Ilya Znamenskiy <ilya.znamenskiy@intel.com>
2020-09-07 17:24:59 +03:00
Maxim Shevtsov
1e6ca0627a MLPerf's pre.2021.1.submission branch update (DO NOT REVIEW) (#2083)
* fixed code and updated unit tests to accommodate auto-reshaping graphs, to unlock full validation

* [CPU][BF16] bf16 for Gemm or MatMul was enabled (#1920)

# Conflicts:
#	inference-engine/thirdparty/mkl-dnn

* Fuse EmbeddingBag

Co-authored-by: Alexey Varyzgin <alexey.varyzgin@intel.com>
Co-authored-by: Vafin, Maxim <maxim.vafin@intel.com>
2020-09-04 18:20:25 +03:00
Maxim Shevtsov
05a57ebd8e fixed code and updated unit tests to accommodate auto-reshaping graphs, to unlock full validation (#1808) 2020-08-17 20:17:30 +03:00
Maxim Shevtsov
e8a178e196 fixed unit tests to accommodate auto-reshaping graphs, to unlock full validation (#1795) 2020-08-14 21:52:22 +03:00
myshevts
0aead5c070 Fuses duplicated QuantizeLinear and DequantizeLinear nodes, (redundancy in the official NV's int8 MLPerf BERT model that is not good for the OV), per discussion with NV reps 2020-08-14 15:57:51 +03:00
myshevts
dcfaeedb6f multi-graph for automatic dynamic sequence handling via auto-pre-reshaping 2020-08-14 15:57:51 +03:00
19 changed files with 503 additions and 111 deletions

View File

@@ -189,6 +189,15 @@ DECLARE_CONFIG_VALUE(NO);
*/
DECLARE_CONFIG_KEY(CPU_THREADS_NUM);
/**
* @brief If set, enables dynamic sequence recognition for 1D inputs for the CPU, the value defines the min seq boundary
*/
DECLARE_CONFIG_KEY(CPU_DYNAMIC_SEQUENCE);
/**
* @brief Step to pre-reshape for the dynamic sequence recognition
*/
DECLARE_CONFIG_KEY(CPU_DYNAMIC_SEQUENCE_STEP);
/**
* @brief The name for setting CPU affinity per thread option.
*

View File

@@ -224,7 +224,7 @@ public:
(precisionInfo.value == Precision::Q78) || (precisionInfo.value == Precision::I16) ||
(precisionInfo.value == Precision::I8) || (precisionInfo.value == Precision::I32) ||
(precisionInfo.value == Precision::I64) || (precisionInfo.value == Precision::BIN) ||
(precisionInfo.value == Precision::CUSTOM);
(precisionInfo.value == Precision::BF16) || (precisionInfo.value == Precision::CUSTOM);
}
protected:

View File

@@ -13,7 +13,7 @@ namespace MKLDNNPlugin {
class BF16Transformer {
const InferenceEngine::details::caseless_set<std::string> _initbf16 =
{ "convolution", "fullyconnected", "innerproduct" };
{ "convolution", "fullyconnected", "innerproduct", "gemm" };
const InferenceEngine::details::caseless_set<std::string> _complementbf16 =
{ "relu", "tanh", "elu", "square", "abs", "sqrt", "linear", "bounded_relu", "soft_relu", "logistic",
"exp", "gelu", "clamp", "swish", "prelu", "pooling", "norm", "gather", "memory" };

View File

@@ -57,6 +57,14 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
// zero and any negative value will be treated
// as default batch size
batchLimit = std::max(val_i, 0);
} else if (key == PluginConfigParams::KEY_CPU_DYNAMIC_SEQUENCE) {
int val_i = std::stoi(val);
// zero and any negative value will be treated
// as default sequence size, so no auto-reshaping will happen
dynamicSequence = std::max(val_i, 0);
} else if (key == PluginConfigParams::KEY_CPU_DYNAMIC_SEQUENCE_STEP) {
int val_i = std::stoi(val);
dynamicSequenceStep = std::max(val_i, 0);
} else if (key == PluginConfigParams::KEY_PERF_COUNT) {
if (val == PluginConfigParams::YES) collectPerfCounters = true;
else if (val == PluginConfigParams::NO) collectPerfCounters = false;
@@ -110,6 +118,15 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
}
if (exclusiveAsyncRequests) // Exclusive request feature disables the streams
streamExecutorConfig._streams = 1;
if (dynamicSequence && !dynamicSequenceStep) {
THROW_IE_EXCEPTION << "Dynamic sequence recognition is enabled, but the "
<< PluginConfigParams::KEY_CPU_DYNAMIC_SEQUENCE_STEP
<< " is not set!";
} else if (!dynamicSequence && dynamicSequenceStep) {
THROW_IE_EXCEPTION << "Dynamic sequence recognition " << PluginConfigParams::KEY_CPU_DYNAMIC_SEQUENCE
<< " is not enabled while the " << PluginConfigParams::KEY_CPU_DYNAMIC_SEQUENCE_STEP << " is set!";
}
updateProperties();
}

View File

@@ -21,6 +21,8 @@ struct Config {
bool collectPerfCounters = false;
bool exclusiveAsyncRequests = false;
bool enableDynamicBatch = false;
int dynamicSequence = 0;
int dynamicSequenceStep = 0;
std::string dumpToDot = "";
std::string dumpQuantizedGraphToDot = "";
std::string dumpQuantizedGraphToIr = "";

View File

@@ -38,70 +38,76 @@ MKLDNNExecNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap network
return std::make_shared<MKLDNNInferRequest>(networkInputs, networkOutputs, std::static_pointer_cast<MKLDNNExecNetwork>(shared_from_this()));
}
MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network,
MKLDNNExecNetwork::MKLDNNExecNetwork(ReshapedCNNNetworks networks,
const Config &cfg,
const MKLDNNExtensionManager::Ptr& extMgr,
NumaNodesWeights &numaNodesWeights) :
InferenceEngine::ExecutableNetworkThreadSafeDefault{nullptr, nullptr},
extensionManager(extMgr),
_cfg{cfg},
_name{network.getName()} {
_name{networks.begin()->second.getName()} {
OV_ITT_SCOPED_TASK(itt::domains::MKLDNNPlugin, "MKLDNNExecNetwork::MKLDNNExecNetwork");
// we are cloning network if we have statistics and we can transform network.
_clonedNetwork = cloneNet(network);
typedef std::map<int, InferenceEngine::details::CNNNetworkImplPtr, sorting_order> PluginInternalNetworks;
PluginInternalNetworks plugin_internal_networks;
for (auto n : networks) {
// we are cloning network if we have statistics and we can transform network.
auto _clonedNetwork = cloneNet(n.second);
if (_cfg.lpTransformsMode == Config::LPTransformsMode::On) {
auto params = LayerTransformation::Params(true, // updatePrecisions
true, // quantizeOutputs
true, // weightsToConst
LayerTransformation::QuantizedTensorAlignment::UpdateLevel, // quantizedTensorAlignmentOnActivations
LayerTransformation::QuantizedTensorAlignment::None, // quantizedTensorAlignmentOnWeights
true, // roundQuantizedValues
true, // updateBiases
true); // supportAsymmetricQuantization
LowPrecisionTransformer transformer(LowPrecisionTransformer::getAllTransformations(params).
add<ConvolutionTransformation>(LayerTransformation::Params(params).setPrecisionsOnActivations({ Precision::U8 }), "Convolution").
addCleanup<ScaleShiftToConvolutionTransformation>(
LayerTransformation::Params(params).setPrecisionsOnActivations({ Precision::U8 }),
"ScaleShift"));
transformer.transform(*_clonedNetwork);
if (_cfg.lpTransformsMode == Config::LPTransformsMode::On) {
auto params = LayerTransformation::Params(true, // updatePrecisions
true, // quantizeOutputs
true, // weightsToConst
LayerTransformation::QuantizedTensorAlignment::UpdateLevel, // quantizedTensorAlignmentOnActivations
LayerTransformation::QuantizedTensorAlignment::None, // quantizedTensorAlignmentOnWeights
true, // roundQuantizedValues
true, // updateBiases
true); // supportAsymmetricQuantization
LowPrecisionTransformer transformer(LowPrecisionTransformer::getAllTransformations(params).
add<ConvolutionTransformation>(
LayerTransformation::Params(params).setPrecisionsOnActivations({Precision::U8}), "Convolution").
addCleanup<ScaleShiftToConvolutionTransformation>(
LayerTransformation::Params(params).setPrecisionsOnActivations({Precision::U8}),
"ScaleShift"));
transformer.transform(*_clonedNetwork);
// Check if network is INT8 or Binary.
// BF16 transformations were disabled since CPU plug-in doesn't support mixed precision execution:
// BF16 + INT8 or BF16 + BIN.
bool isFloatModel = true;
CNNNetworkIterator i(&network);
while (i != CNNNetworkIterator()) {
if (CaselessEq<std::string>()((*i)->type, "FakeQuantize")) {
isFloatModel = false;
break;
// Check if network is INT8 or Binary.
// BF16 transformations were disabled since CPU plug-in doesn't support mixed precision execution:
// BF16 + INT8 or BF16 + BIN.
bool isFloatModel = true;
CNNNetworkIterator i(&n.second.operator InferenceEngine::ICNNNetwork &());
while (i != CNNNetworkIterator()) {
if (CaselessEq<std::string>()((*i)->type, "FakeQuantize")) {
isFloatModel = false;
break;
}
i++;
}
if (with_cpu_x86_bfloat16() && isFloatModel) {
BF16Transformer bf16Transformer;
CNNNetwork cnnetwork(_clonedNetwork);
// If enforceBF16 flag was set, BF16 transformation applies for all layers supported by CPU plugin.
// Overwise, only layers marked as BF16 in 'cnnetwork' will be performed in bfloat16 mode.
// CPU plugin throws an exception, if marked as BF16 layers have not supported by CPU plugin.
if (cfg.enforceBF16 == true)
bf16Transformer.convertToBFloat16(cnnetwork);
} else {
BF16Transformer bf16Transformer;
CNNNetwork cnnetwork(_clonedNetwork);
bf16Transformer.convertToFloat(cnnetwork);
}
i++;
}
if (with_cpu_x86_bfloat16() && isFloatModel) {
BF16Transformer bf16Transformer;
CNNNetwork cnnetwork(_clonedNetwork);
// If enforceBF16 flag was set, BF16 transformation applies for all layers supported by CPU plugin.
// Overwise, only layers marked as BF16 in 'cnnetwork' will be performed in bfloat16 mode.
// CPU plugin throws an exception, if marked as BF16 layers have not supported by CPU plugin.
if (cfg.enforceBF16 == true)
bf16Transformer.convertToBFloat16(cnnetwork);
} else {
BF16Transformer bf16Transformer;
CNNNetwork cnnetwork(_clonedNetwork);
bf16Transformer.convertToFloat(cnnetwork);
}
}
MKLDNNGraph::ApplyUnrollPasses(static_cast<ICNNNetwork&>(*_clonedNetwork));
MKLDNNGraph::ApplyUnrollPasses(static_cast<ICNNNetwork &>(*_clonedNetwork));
if (_cfg.enableDynamicBatch) {
// check topology for applicability
if (!CanProcessDynBatch(*_clonedNetwork)) {
THROW_IE_EXCEPTION << "MKLDNNGraph::CreateGraph: such topology cannot be compiled for dynamic batch!";
// check topology for applicability
if (!CanProcessDynBatch(*_clonedNetwork)) {
THROW_IE_EXCEPTION << "MKLDNNGraph::CreateGraph: such topology cannot be compiled for dynamic batch!";
}
}
plugin_internal_networks[n.first] = _clonedNetwork;
}
if (cfg.exclusiveAsyncRequests) {
@@ -131,19 +137,23 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network
_graphs = decltype(_graphs){[&] {
// TODO: Remove `cloneNet` to `localNetwork` when `MKLDNNGraph::CreateGraph`
// is fixed and does not change content of network passed (CVS-26420)
auto localNetwork = cloneNet(static_cast<ICNNNetwork&>(*_clonedNetwork));
auto graph = std::make_shared<MKLDNNGraph>();
{
std::unique_lock<std::mutex> lock{_cfgMutex};
graph->setConfig(_cfg);
SequenceGraphs m;
for (auto n : plugin_internal_networks) {
auto localNetwork = cloneNet(static_cast<ICNNNetwork&>(*n.second.get()));
auto graph = std::make_shared<MKLDNNGraph>();
{
std::unique_lock<std::mutex> lock{_cfgMutex};
graph->setConfig(_cfg);
}
int numaNode = 0;
auto *streamExecutor = dynamic_cast<InferenceEngine::IStreamsExecutor *>(_taskExecutor.get());
if (nullptr != streamExecutor) {
numaNode = streamExecutor->GetNumaNodeId();
}
graph->CreateGraph(static_cast<ICNNNetwork &>(*localNetwork), extensionManager, numaNodesWeights[numaNode]);
m[n.first] = graph;
}
int numaNode = 0;
auto* streamExecutor = dynamic_cast<InferenceEngine::IStreamsExecutor*>(_taskExecutor.get());
if (nullptr != streamExecutor) {
numaNode = streamExecutor->GetNumaNodeId();
}
graph->CreateGraph(static_cast<ICNNNetwork&>(*localNetwork), extensionManager, numaNodesWeights[numaNode]);
return graph;
return m;
}};
_taskExecutor->runAndWait({std::thread::hardware_concurrency(), [this] {_graphs.local();}});
@@ -152,7 +162,7 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network
// of MemoryLayer implementation. It uses output edge of MemoryLayer
// producer as storage for tensor to keep it between infer calls.
if (_graphs.size() == 1) {
for (auto &node : _graphs.begin()->get()->GetNodes()) {
for (auto &node : _graphs.begin()->begin()->second->GetNodes()) {
if (node->getType() == MemoryInput) {
auto memoryNode = dynamic_cast<MKLDNNMemoryInputNode*>(node.get());
auto state_store = memoryNode->getStore();
@@ -174,9 +184,9 @@ void MKLDNNExecNetwork::setProperty(const std::map<std::string, std::string> &pr
std::lock_guard<std::mutex> lock{_cfgMutex};
_cfg.readProperties(properties);
}
for (auto g : _graphs) {
g->setProperty(properties);
}
for (auto g : _graphs)
for (auto s : g)
s.second->setProperty(properties);
}
void MKLDNNExecNetwork::CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) {
@@ -193,13 +203,13 @@ void MKLDNNExecNetwork::GetExecGraphInfo(InferenceEngine::ICNNNetwork::Ptr &grap
if (_graphs.size() == 0)
THROW_IE_EXCEPTION << "No graph was found";
graphPtr = _graphs.begin()->get()->dump();
graphPtr = _graphs.begin()->begin()->second->dump();
}
void MKLDNNExecNetwork::GetConfig(const std::string &name, Parameter &result, ResponseDesc *resp) const {
if (_graphs.size() == 0)
THROW_IE_EXCEPTION << "No graph was found";
Config engConfig = _graphs.begin()->get()->getProperty();
Config engConfig = _graphs.begin()->begin()->second->getProperty();
auto option = engConfig._config.find(name);
if (option != engConfig._config.end()) {
result = option->second;
@@ -213,9 +223,9 @@ void MKLDNNExecNetwork::GetMetric(const std::string &name, Parameter &result, Re
THROW_IE_EXCEPTION << "No graph was found";
if (name == METRIC_KEY(NETWORK_NAME)) {
if (_graphs.begin()->get()->dump() == nullptr)
if (_graphs.begin()->begin()->second->dump() == nullptr)
THROW_IE_EXCEPTION << "Invalid graph dump";
result = IE_SET_METRIC(NETWORK_NAME, _graphs.begin()->get()->dump()->getName());
result = IE_SET_METRIC(NETWORK_NAME, _graphs.begin()->begin()->second->dump()->getName());
} else if (name == METRIC_KEY(SUPPORTED_METRICS)) {
std::vector<std::string> metrics;
metrics.push_back(METRIC_KEY(NETWORK_NAME));
@@ -225,12 +235,12 @@ void MKLDNNExecNetwork::GetMetric(const std::string &name, Parameter &result, Re
result = IE_SET_METRIC(SUPPORTED_METRICS, metrics);
} else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) {
std::vector<std::string> configKeys;
for (auto && key : _graphs.begin()->get()->getProperty()._config) {
for (auto && key : _graphs.begin()->begin()->second->getProperty()._config) {
configKeys.push_back(key.first);
}
result = IE_SET_METRIC(SUPPORTED_CONFIG_KEYS, configKeys);
} else if (name == METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)) {
Config engConfig = _graphs.begin()->get()->getProperty();
Config engConfig = _graphs.begin()->begin()->second->getProperty();
auto option = engConfig._config.find(CONFIG_KEY(CPU_THROUGHPUT_STREAMS));
IE_ASSERT(option != engConfig._config.end());
auto streams = std::stoi(option->second);

View File

@@ -18,6 +18,9 @@
#include <unordered_map>
namespace MKLDNNPlugin {
typedef std::less<int> sorting_order;
typedef std::map<int, MKLDNNGraph::Ptr, sorting_order> SequenceGraphs;
typedef std::map<int, InferenceEngine::CNNNetwork, sorting_order> ReshapedCNNNetworks;
class MKLDNNExecNetwork: public InferenceEngine::ExecutableNetworkThreadSafeDefault {
public:
@@ -29,7 +32,7 @@ public:
void CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) override;
MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network, const Config &cfg,
MKLDNNExecNetwork(ReshapedCNNNetworks, const Config &cfg,
const MKLDNNExtensionManager::Ptr &extMgr, NumaNodesWeights &weightsSharing);
~MKLDNNExecNetwork() override = default;
@@ -44,13 +47,12 @@ public:
std::vector<InferenceEngine::IMemoryStateInternal::Ptr> QueryState() override;
InferenceEngine::ThreadLocal<MKLDNNGraph::Ptr> _graphs;
InferenceEngine::ThreadLocal<SequenceGraphs> _graphs;
protected:
friend class MKLDNNInferRequest;
MKLDNNExtensionManager::Ptr extensionManager;
std::vector<InferenceEngine::IMemoryStateInternal::Ptr> memoryStates;
InferenceEngine::details::CNNNetworkImplPtr _clonedNetwork;
std::mutex _cfgMutex;
Config _cfg;
std::atomic_int _numRequests = {0};

View File

@@ -758,9 +758,14 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
ext_blob->allocate();
}
if (ext_blob->byteSize() != intr_blob.GetSize())
if (config.dynamicSequence) {
if (ext_blob->byteSize() < intr_blob.GetSize())
THROW_IE_EXCEPTION << "Output blob size is less than network output size ("
<< ext_blob->size() << "<" << intr_blob.GetSize() / sizeof(float) << ").";
} else if (ext_blob->byteSize() != intr_blob.GetSize()) {
THROW_IE_EXCEPTION << "Output blob size is not equal network output size ("
<< ext_blob->size() << "!=" << intr_blob.GetSize()/sizeof(float) << ").";
<< ext_blob->size() << "!=" << intr_blob.GetSize() / sizeof(float) << ").";
}
void *ext_blob_ptr = ext_blob->buffer();
void *intr_blob_ptr = intr_blob.GetData();
@@ -776,6 +781,13 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
size_t size_to_copy = intr_blob.GetSize() * MB_to_process / MB;
ie_memcpy(ext_blob_ptr, ext_blob->byteSize(), intr_blob_ptr, size_to_copy);
if (config.dynamicSequence && ext_blob->size() > intr_blob.GetElementsCount()) {
if (ext_blob->getTensorDesc().getPrecision() != InferenceEngine::Precision::FP32)
THROW_IE_EXCEPTION << "Dynamic sequence is supported only for the fp32 outputs only!";
auto elements = intr_blob.GetElementsCount();
std::fill(static_cast<float*>(ext_blob_ptr) + elements,
static_cast<float*>(ext_blob_ptr) + ext_blob->size(), -std::numeric_limits<float>::max());
}
}
}

View File

@@ -24,7 +24,10 @@ MKLDNNPlugin::MKLDNNInferRequest::MKLDNNInferRequest(InferenceEngine::InputsData
if (execNetwork->_graphs.size() == 0)
THROW_IE_EXCEPTION << "No graph was found";
graph = execNetwork->_graphs.begin()->get();
const int seq = execNetwork->_graphs.begin()->size() > 1
? _networkInputs.cbegin()->second->getTensorDesc().getDims()[1]
: 0;
graph = execNetwork->_graphs.begin()->at(seq).get();
for (const auto& it : _networkInputs) {
InferenceEngine::Blob::Ptr blob;
MKLDNNInferRequest::GetBlob(it.first.c_str(), blob);
@@ -79,13 +82,28 @@ void copyToFloat(float* dst, const InferenceEngine::Blob* src) {
void MKLDNNPlugin::MKLDNNInferRequest::InferImpl() {
using namespace openvino::itt;
const bool dyn_sequence = execNetwork->_graphs.local().size() > 1;
auto dims = _inputs.cbegin()->second->getTensorDesc().getDims();
OV_ITT_SCOPED_TASK(itt::domains::MKLDNNPlugin, profilingTask);
graph = execNetwork->_graphs.local().get();
if (dyn_sequence) {
// graph per sequence
const int *ptr = _inputs.cbegin()->second->buffer().as<int *>();
auto sz = _inputs.cbegin()->second->size();
const int size_non_zero = std::distance(ptr,
std::find_if(ptr, ptr + sz, [](int x) { return x == 0; }));
const int actual_seq = execNetwork->_graphs.local().lower_bound(size_non_zero)->first;
// std::cout << "Last non-zero : " << size_non_zero << ", Actual Seq : " << actual_seq << std::endl;
graph = execNetwork->_graphs.local()[actual_seq].get();
dims[1] = actual_seq;
} else {
graph = execNetwork->_graphs.local().begin()->second.get();
}
{
execDataPreprocessing(_inputs);
changeDefaultPtr();
if (!dyn_sequence)
changeDefaultPtr();
// need to retain converted blobs until infer finish
std::vector<InferenceEngine::Blob::Ptr> convertedInputs;
@@ -103,7 +121,16 @@ void MKLDNNPlugin::MKLDNNInferRequest::InferImpl() {
pushInput<float>(input.first, input.second);
break;
case InferenceEngine::Precision::I32:
pushInput<int32_t>(input.first, input.second);
if (dyn_sequence) {
iconv = InferenceEngine::make_shared_blob<int32_t>({InferenceEngine::Precision::I32,
dims,
input.second->getTensorDesc().getLayout()},
input.second->buffer());
convertedInputs.push_back(iconv);
pushInput<int32_t>(input.first, iconv);
} else {
pushInput<int32_t>(input.first, input.second);
}
break;
case InferenceEngine::Precision::I8:
pushInput<int8_t>(input.first, input.second);

View File

@@ -158,28 +158,51 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::ICNNNetwork &network, const st
conf.batchLimit = static_cast<int>(network.getBatchSize());
}
std::shared_ptr<ICNNNetwork> clonedNetwork = cloneNetwork(network);
bool is_transformed = false;
if (clonedNetwork->getFunction()) {
Transformation(clonedNetwork);
is_transformed = true;
CNNNetwork localNetwork(cloneNetwork(network));
const InputsDataMap inputInfo = localNetwork.getInputsInfo();
ICNNNetwork::InputShapes shapes = localNetwork.getInputShapes();
ReshapedCNNNetworks reshapedNetworks;
int seq = 0;
if (conf.dynamicSequence) {
if (shapes.at(inputInfo.cbegin()->first).size() < 2)
THROW_IE_EXCEPTION << "Auto-reshaping of the network with no sequence (first input is scalar or channels-only)!";
seq = shapes.at(inputInfo.cbegin()->first)[1];
}
auto implNetwork = std::dynamic_pointer_cast<details::CNNNetworkImpl>(clonedNetwork);
if (implNetwork) {
// valid for CNNNetworkImpl only, while there's no API in ICNNNetwork to change network
ConstTransformer transformator(implNetwork.get());
transformator.fullTrim();
if (!is_transformed) {
NetPass::ConvertPrecision(*implNetwork, Precision::I64, Precision::I32);
NetPass::ConvertPrecision(*implNetwork, Precision::U64, Precision::I32);
NetPass::ConvertPrecision(*implNetwork, Precision::U32, Precision::I32);
NetPass::ConvertPrecision(*implNetwork, Precision::FP16, Precision::FP32);
NetPass::ConvertPrecision(*implNetwork, Precision::BOOL, Precision::U8);
NetPass::ConvertPrecision(*implNetwork, Precision::U16, Precision::I32);
do {
CNNNetwork clonedNetwork(cloneNetwork(network));
if (conf.dynamicSequence) {
for (const InputsDataMap::value_type &item : inputInfo)
shapes[item.first][1] = seq;
// std::cout << "Reshaped network by sequence to " << seq << std::endl;
clonedNetwork.reshape(shapes);
}
}
bool is_transformed = false;
if (clonedNetwork.getFunction()) {
auto temp = clonedNetwork.operator ICNNNetwork::Ptr();
Transformation(temp);
clonedNetwork = CNNNetwork(temp);
is_transformed = true;
}
auto implNetwork = std::dynamic_pointer_cast<details::CNNNetworkImpl>(
clonedNetwork.operator ICNNNetwork::Ptr());
if (implNetwork) {
// valid for CNNNetworkImpl only, while there's no API in ICNNNetwork to change network
ConstTransformer transformator(implNetwork.get());
transformator.fullTrim();
if (!is_transformed) {
NetPass::ConvertPrecision(*implNetwork, Precision::I64, Precision::I32);
NetPass::ConvertPrecision(*implNetwork, Precision::U64, Precision::I32);
NetPass::ConvertPrecision(*implNetwork, Precision::U32, Precision::I32);
NetPass::ConvertPrecision(*implNetwork, Precision::FP16, Precision::FP32);
NetPass::ConvertPrecision(*implNetwork, Precision::BOOL, Precision::U8);
NetPass::ConvertPrecision(*implNetwork, Precision::U16, Precision::I32);
}
}
reshapedNetworks[seq] = clonedNetwork;
seq -= conf.dynamicSequenceStep;
} while (conf.dynamicSequence && seq >= conf.dynamicSequence);
return std::make_shared<MKLDNNExecNetwork>(*clonedNetwork, conf, extensionManager, weightsSharing);
return std::make_shared<MKLDNNExecNetwork>(reshapedNetworks, conf, extensionManager, weightsSharing);
}
void Engine::SetConfig(const std::map<std::string, std::string> &config) {

View File

@@ -122,8 +122,13 @@ void MKLDNNGemmNode::initSupportedPrimitiveDescriptors() {
auto inPrec0 = getCnnLayer()->insData[0].lock()->getPrecision();
auto inPrec1 = getCnnLayer()->insData[1].lock()->getPrecision();
if ((inPrec0 != Precision::U8 && inPrec0 != Precision::I8) || inPrec1 != Precision::I8 || isThreeInputs) {
inPrec0 = Precision::FP32;
inPrec1 = Precision::FP32;
if (inPrec0 == Precision::BF16 || inPrec1 == Precision::BF16) {
inPrec0 = Precision::BF16;
inPrec1 = Precision::BF16;
} else {
inPrec0 = Precision::FP32;
inPrec1 = Precision::FP32;
}
}
auto inputDataType0 = MKLDNNExtensionUtils::IEPrecisionToDataType(inPrec0);
@@ -192,6 +197,11 @@ inline void process_gemm(char transa, char transb, int M, int N, int K, float al
mkldnn_sgemm(transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
}
// BF16 overload of process_gemm: dispatches to mkl-dnn's bf16*bf16 -> fp32 GEMM.
// A and B carry bfloat16 values in uint16_t storage; C is accumulated in fp32
// (matches the Precision::BF16 case that calls process_data<uint16_t, uint16_t>).
inline void process_gemm(char transa, char transb, int M, int N, int K, float alpha, const uint16_t *A, int lda,
const uint16_t *B, int ldb, float beta, float *C, int ldc) {
mkldnn_gemm_bf16bf16f32(transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
}
inline void process_gemm(char transa, char transb, int M, int N, int K, float alpha, const uint8_t *A, int lda,
const int8_t *B, int ldb, float beta, float *C, int ldc) {
const int32_t co = 0;
@@ -288,6 +298,9 @@ void MKLDNNGemmNode::execute(mkldnn::stream strm) {
case Precision::FP32:
process_data<float, float>();
break;
case Precision::BF16:
process_data<uint16_t, uint16_t>();
break;
case Precision::I8:
process_data<int8_t, int8_t>();
break;

View File

@@ -16,7 +16,7 @@ using namespace mkldnn;
class MKLDNNTestExecNetwork: public MKLDNNPlugin::MKLDNNExecNetwork {
public:
MKLDNNPlugin::MKLDNNGraph& getGraph() {
return *(_graphs.begin()->get());
return *(_graphs.begin()->begin()->second);
}
};

View File

@@ -1198,7 +1198,7 @@ TEST_F(MKLDNNGraphStructureTests, TestOutputAfterInplacePlusConcat) {
InferenceEngine::Core core;
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
execNetwork->setNetworkInputs(_networkInputs);
@@ -1714,7 +1714,7 @@ TEST_F(MKLDNNGraphStructureTests, TestResnetPart) {
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
execNetwork->setNetworkInputs(_networkInputs);
@@ -1864,7 +1864,7 @@ TEST_F(MKLDNNGraphStructureTests, TestConcatAfterConcat) {
InferenceEngine::Core core;
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
execNetwork->setNetworkInputs(_networkInputs);
@@ -2045,7 +2045,7 @@ TEST_F(MKLDNNGraphStructureTests, Test2ConcatFromConcat) {
InferenceEngine::Core core;
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
execNetwork->setNetworkInputs(_networkInputs);
@@ -2377,7 +2377,7 @@ TEST_F(MKLDNNGraphStructureTests, TestLoadTopologyWithConstLayer) {
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
execNetwork->setNetworkInputs(_networkInputs);
@@ -2525,7 +2525,7 @@ TEST_F(MKLDNNGraphStructureTests, TestLoadTopologyWithEltwiseBeforeConcat) {
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
execNetwork->setNetworkInputs(_networkInputs);

View File

@@ -76,6 +76,7 @@ JitConstants SoftmaxKerneItemsClassOptimized::GetJitConstants(const softmax_para
auto jit = SoftmaxItemsClassKernelBase::GetJitConstants(params, kd);
jit.AddConstant(MakeJitConstant("WORKITEMS_PER_CLASSES", workitems_per_classes));
jit.AddConstant(MakeJitConstant("HAS_DRIVER_PROBLEMS", params.engineInfo.bIMADSupport));
return jit;
}

View File

@@ -63,12 +63,24 @@ KERNEL(softmax_items_class_optimized)(__global INPUT0_TYPE* input, __global OUTP
ACCUMULATOR_TYPE denominator = 0.0;
for (uint cls = 0; cls < FULL_ITERATIONS_NUM; cls++)
{
// This is a temporary solution for unresolved problem when ocl kernels compilation step doesn't produce actual binaries
// for current kernel but driver doesn't report any errors (JIRA CVS-32211)
#if HAS_DRIVER_PROBLEMS
data[cls] = data[cls] == max_value ? 1.0 : native_exp(data[cls] - max_value);
#else
data[cls] = native_exp(data[cls] - max_value);
#endif
denominator += data[cls];
}
if(simd_lane < LEFTOVERS)
{
// This is a temporary solution for unresolved problem when ocl kernels compilation step doesn't produce actual binaries
// for current kernel but driver doesn't report any errors (JIRA CVS-32211)
#if HAS_DRIVER_PROBLEMS
data[DATA_PER_WORKITEM-1] = data[DATA_PER_WORKITEM-1] == max_value ? 1.0 : native_exp(data[DATA_PER_WORKITEM-1] - max_value);
#else
data[DATA_PER_WORKITEM-1] = native_exp(data[DATA_PER_WORKITEM-1] - max_value);
#endif
denominator += data[DATA_PER_WORKITEM-1];
}

View File

@@ -119,6 +119,7 @@ extensions/front/create_tensor_nodes.py
extensions/front/disable_weights_quantize_value_propagation.py
extensions/front/div.py
extensions/front/eltwise_n.py
extensions/front/EmbeddingBagFuse.py
extensions/front/ExpandDimsToUnsqueeze.py
extensions/front/FillToBroadcast.py
extensions/front/flatten_to_reshape.py
@@ -286,6 +287,7 @@ extensions/front/onnx/priorbox_ext.py
extensions/front/onnx/priorgridgenerator_ext.py
extensions/front/onnx/proposal_ext.py
extensions/front/onnx/quantize_dequantize_linear.py
extensions/front/onnx/quantize_dequantize_redundant.py
extensions/front/onnx/quantize_ext.py
extensions/front/onnx/quantize_linear_ext.py
extensions/front/onnx/quantize_linear_resolver.py

View File

@@ -0,0 +1,91 @@
"""
Copyright (C) 2020 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from extensions.ops.embedding_bag import EmbeddingBagOffsetsSum
from mo.front.common.replacement import FrontReplacementSubgraph
from mo.graph.graph import Graph, rename_nodes
class EmbeddingBagFuse(FrontReplacementSubgraph):
    # Fuses the per-bag expansion of an embedding lookup (offsets -> Gather/Unsqueeze
    # slice bounds -> Slice of indices -> Gather from shared weights -> ReduceSum ->
    # Unsqueeze -> Concat) into a single EmbeddingBagOffsetsSum operation.
    enabled = True

    def run_after(self):
        # These transforms normalize ExpandDims/AttributedGather into the plain
        # Unsqueeze/Gather ops this pattern matches on, so they must run first.
        from extensions.front.ExpandDimsToUnsqueeze import ExpandDimsToUnsqueeze
        from extensions.front.AttributedGatherNormalizer import AttributedGatherNormalizer
        return [ExpandDimsToUnsqueeze, AttributedGatherNormalizer]

    def pattern(self):
        # Matches one representative bag-branch of the expanded sub-graph:
        # 'concat_before' carries the per-bag offsets; two Gather/Unsqueeze pairs
        # produce the start/end fed into 'slice1' (ports 1 and 2); the sliced
        # indices drive a Gather from the 'weights' Const, the bag is reduced with
        # ReduceSum, and all bags are stitched back together by 'concat_after'.
        return dict(
            nodes=[
                ('weights', dict(op='Const')),
                ('concat_before', dict(op='Concat')),
                ('gather_before1_1', dict(op='Gather')),
                ('unsqueeze_before1_1', dict(op='Unsqueeze')),
                ('gather_before2_1', dict(op='Gather')),
                ('unsqueeze_before2_1', dict(op='Unsqueeze')),
                ('slice1', dict(op='Slice')),
                ('gather_after1', dict(op='Gather')),
                ('reduce1', dict(op='ReduceSum')),
                ('unsqueeze_after1', dict(op='Unsqueeze')),
                ('concat_after', dict(op='Concat')),
            ],
            edges=[
                ('concat_before', 'gather_before1_1'),
                ('concat_before', 'gather_before2_1'),
                ('gather_before1_1', 'unsqueeze_before1_1'),
                ('gather_before2_1', 'unsqueeze_before2_1'),
                ('unsqueeze_before1_1', 'slice1', {'out': 0, 'in': 1}),
                ('unsqueeze_before2_1', 'slice1', {'out': 0, 'in': 2}),
                ('weights', 'gather_after1', {'out': 0, 'in': 0}),
                ('slice1', 'gather_after1', {'out': 0, 'in': 1}),
                ('gather_after1', 'reduce1'),
                ('reduce1', 'unsqueeze_after1'),
                ('unsqueeze_after1', 'concat_after'),
            ])

    def replace_sub_graph(self, graph: Graph, match: dict):
        """Replaces the whole matched expansion (all bag-branches, not just the
        matched one) with a single EmbeddingBagOffsetsSum node and removes the
        now-dead sub-graph reachable from 'concat_before'."""
        concat_before = match['concat_before']
        gather_after1 = match['gather_after1']
        slice1 = match['slice1']
        concat_after = match['concat_after']
        # weights tensor shared by every bag's Gather; port 2 of Gather is the axis
        weights_node = gather_after1.in_port(0).get_source().node
        gather_after_axis = gather_after1.in_port(2).get_source().node.soft_get('value')
        for dst_port in weights_node.out_port(0).get_destinations():
            node = dst_port.node
            if node.op == 'Gather':
                # validate that all Gathers have same axis
                # NOTE(review): returning here leaves previously visited Gather
                # ports already disconnected — TODO confirm the matcher guarantees
                # uniform axes before any disconnect happens
                if node.in_port(2).get_source().node.soft_get('value') != gather_after_axis:
                    return
                dst_port.disconnect()
        # indices tensor shared by every bag's Slice; port 3 of Slice is the axis
        indices_node = slice1.in_port(0).get_source().node
        slice_axis = slice1.in_port(3).get_source().node.soft_get('value')
        for dst_port in indices_node.out_port(0).get_destinations():
            node = dst_port.node
            if node.op == 'Slice':
                # validate that all Slices have same axis
                if node.in_port(3).get_source().node.soft_get('value') != slice_axis:
                    return
                dst_port.disconnect()
        # wire weights (port 0), indices (port 1) and offsets (port 2) into the
        # fused op, and let it drive everything 'concat_after' used to feed
        emb_bag = EmbeddingBagOffsetsSum(graph, {}).create_node()
        weights_node.out_port(0).connect(emb_bag.in_port(0))
        indices_node.out_port(0).connect(emb_bag.in_port(1))
        concat_before.in_port(0).get_connection().set_destination(emb_bag.in_port(2))
        concat_after.out_port(0).get_connection().set_source(emb_bag.out_port(0))
        # keep the original output name on the fused node for framework mapping
        concat_name = concat_after.soft_get('name', concat_after.id)
        rename_nodes([(concat_after, concat_name + '/TBD'), (emb_bag, concat_name)])
        # remove this sub-graph since a lot of matchings will be obsolete
        graph.remove_nodes_from(graph.dfs(concat_before.id, set()))

View File

@@ -0,0 +1,171 @@
"""
Copyright (C) 2018-2020 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import logging as log
from mo.front.common.replacement import FrontReplacementSubgraph
from mo.front.subgraph_matcher import SubgraphMatch
from mo.graph.graph import Graph
class QuantizeDequantizeRedundant2(FrontReplacementSubgraph):
    """
    Fuses duplicated QuantizeLinear and DequantizeLinear nodes
    (redundancy in the official NV's int8 MLPerf BERT model).

    Covers the case when two parallel Q->DQ chains hang off the same Add and use
    identical constant scale and zero-point values: consumers of the duplicated
    chain are re-routed to the surviving DequantizeLinear and the duplicate is
    removed from the graph.
    """
    enabled = True

    def run_before(self):
        # must fire before the Q->DQ pairs are folded by the generic resolver
        from extensions.front.onnx.quantize_dequantize_linear import QuantizeDequantizeLinear
        return [QuantizeDequantizeLinear]

    def pattern(self):
        return dict(
            nodes=[
                ('inp', dict(op='Add')),
                ('quantize0', dict(op='QuantizeLinear')),
                ('dequantize0', dict(op='DequantizeLinear')),
                ('quantize1', dict(op='QuantizeLinear')),
                ('dequantize1', dict(op='DequantizeLinear')),
            ],
            edges=[
                ('inp', 'quantize0', {'in': 0}),
                ('inp', 'quantize1', {'in': 0}),
                ('quantize0', 'dequantize0', {'in': 0}),
                ('quantize1', 'dequantize1', {'in': 0}),
            ]
        )

    def replace_sub_graph(self, graph: Graph, match: [dict, SubgraphMatch]):
        """Removes the 'quantize1'->'dequantize1' chain when its scale/zero-point
        constants equal those of the 'quantize0'->'dequantize0' chain."""
        q0 = match['quantize0']
        q1 = match['quantize1']
        # QuantizeLinear inputs: port 1 is scale, port 2 is zero-point
        q0_scale = q0.in_port(1).get_source().node
        q0_zerop = q0.in_port(2).get_source().node
        q1_scale = q1.in_port(1).get_source().node
        q1_zerop = q1.in_port(2).get_source().node
        inp_port = q0.in_port(0).get_source()
        name = inp_port.node.soft_get('name', inp_port.node.id)
        # only constant as for zero_point/scale supported
        if q0_scale.soft_get('type') == 'Const' and q1_scale.soft_get('type') == 'Const' and \
                q0_zerop.soft_get('type') == 'Const' and q1_zerop.soft_get('type') == 'Const':
            # only patterns with same scale/zero_point values for Q and DQ are supported
            if q0_scale.value == q1_scale.value and q0_zerop.value == q1_zerop.value:
                log.debug('Redundant 2Q-DQ pattern after {}'.format(name))
                # re-route every consumer of the duplicate chain to the survivor
                for dest in match['dequantize1'].out_port(0).get_destinations():
                    dest.disconnect()
                    dest.connect(match['dequantize0'].out_port(0))
                graph.remove_nodes_from([match['quantize1'].id, match['dequantize1'].id])
            else:
                # bug fix: the original message had no '{}' placeholder, so the
                # node name passed to format() was silently dropped
                log.error('QuantizeLinears in the fan-out of {} have different scale or zero-point '
                          'values, cannot be removed!'.format(name))
class QuantizeDequantizeRedundant4(FrontReplacementSubgraph):
    """
    Fuses duplicated QuantizeLinear and DequantizeLinear nodes
    (redundancy in the official NV's int8 MLPerf BERT model).

    Covers the case when four parallel Q->DQ chains hang off the same Add and use
    identical constant scale and zero-point values: consumers of the three
    duplicated chains are re-routed to the surviving DequantizeLinear and the
    duplicates are removed from the graph.
    """
    enabled = True

    def run_before(self):
        # run the 4-way case first so the 2-way pattern does not partially
        # consume it
        return [QuantizeDequantizeRedundant2]

    def pattern(self):
        return dict(
            nodes=[
                ('inp', dict(op='Add')),
                ('quantize0', dict(op='QuantizeLinear')),
                ('dequantize0', dict(op='DequantizeLinear')),
                ('quantize1', dict(op='QuantizeLinear')),
                ('dequantize1', dict(op='DequantizeLinear')),
                ('quantize2', dict(op='QuantizeLinear')),
                ('dequantize2', dict(op='DequantizeLinear')),
                ('quantize3', dict(op='QuantizeLinear')),
                ('dequantize3', dict(op='DequantizeLinear')),
            ],
            edges=[
                ('inp', 'quantize0', {'in': 0}),
                ('inp', 'quantize1', {'in': 0}),
                ('inp', 'quantize2', {'in': 0}),
                ('inp', 'quantize3', {'in': 0}),
                ('quantize0', 'dequantize0', {'in': 0}),
                ('quantize1', 'dequantize1', {'in': 0}),
                ('quantize2', 'dequantize2', {'in': 0}),
                ('quantize3', 'dequantize3', {'in': 0}),
            ]
        )

    def replace_sub_graph(self, graph: Graph, match: [dict, SubgraphMatch]):
        """Removes the 'quantize1..3'->'dequantize1..3' chains when their
        scale/zero-point constants equal those of 'quantize0'->'dequantize0'.

        Bug fix vs. the original: the constant check tested q2_zerop and q3_zerop
        twice and never tested q2_scale/q3_scale at all (copy-paste error); all
        four scales and zero-points are now verified.
        """
        quantizes = [match['quantize{}'.format(i)] for i in range(4)]
        # QuantizeLinear inputs: port 1 is scale, port 2 is zero-point
        scales = [q.in_port(1).get_source().node for q in quantizes]
        zerops = [q.in_port(2).get_source().node for q in quantizes]
        inp_port = quantizes[0].in_port(0).get_source()
        name = inp_port.node.soft_get('name', inp_port.node.id)
        # only constant as for zero_point/scale supported
        if any(n.soft_get('type') != 'Const' for n in scales + zerops):
            return
        # only patterns with same scale/zero_point values for Q and DQ are supported
        if all(scales[0].value == s.value and zerops[0].value == z.value
               for s, z in zip(scales[1:], zerops[1:])):
            log.debug('Redundant 4Q-DQ pattern after {}'.format(name))
            for i in range(1, 4):
                dequantize = match['dequantize{}'.format(i)]
                # re-route every consumer of the duplicate chain to the survivor
                for dest in dequantize.out_port(0).get_destinations():
                    dest.disconnect()
                    dest.connect(match['dequantize0'].out_port(0))
                graph.remove_nodes_from([match['quantize{}'.format(i)].id, dequantize.id])
        else:
            # bug fix: the original message had no '{}' placeholder, so the
            # node name passed to format() was silently dropped
            log.error('QuantizeLinears in the fan-out of {} have different scale or zero-point '
                      'values, cannot be removed!'.format(name))