Compare commits
7 Commits
2023.2.0.d
...
releases/2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
834755680d | ||
|
|
c8b783f644 | ||
|
|
1e6ca0627a | ||
|
|
05a57ebd8e | ||
|
|
e8a178e196 | ||
|
|
0aead5c070 | ||
|
|
dcfaeedb6f |
@@ -189,6 +189,15 @@ DECLARE_CONFIG_VALUE(NO);
|
||||
*/
|
||||
DECLARE_CONFIG_KEY(CPU_THREADS_NUM);
|
||||
|
||||
/**
|
||||
* @brief If set, enables dynamic sequence recognition for 1D inputs on the CPU; the value defines the minimum sequence boundary
|
||||
*/
|
||||
DECLARE_CONFIG_KEY(CPU_DYNAMIC_SEQUENCE);
|
||||
/**
|
||||
* @brief Step to pre-reshape for the dynamic sequence recognition
|
||||
*/
|
||||
DECLARE_CONFIG_KEY(CPU_DYNAMIC_SEQUENCE_STEP);
|
||||
|
||||
/**
|
||||
* @brief The name for setting CPU affinity per thread option.
|
||||
*
|
||||
|
||||
@@ -224,7 +224,7 @@ public:
|
||||
(precisionInfo.value == Precision::Q78) || (precisionInfo.value == Precision::I16) ||
|
||||
(precisionInfo.value == Precision::I8) || (precisionInfo.value == Precision::I32) ||
|
||||
(precisionInfo.value == Precision::I64) || (precisionInfo.value == Precision::BIN) ||
|
||||
(precisionInfo.value == Precision::CUSTOM);
|
||||
(precisionInfo.value == Precision::BF16) || (precisionInfo.value == Precision::CUSTOM);
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
@@ -13,7 +13,7 @@ namespace MKLDNNPlugin {
|
||||
|
||||
class BF16Transformer {
|
||||
const InferenceEngine::details::caseless_set<std::string> _initbf16 =
|
||||
{ "convolution", "fullyconnected", "innerproduct" };
|
||||
{ "convolution", "fullyconnected", "innerproduct", "gemm" };
|
||||
const InferenceEngine::details::caseless_set<std::string> _complementbf16 =
|
||||
{ "relu", "tanh", "elu", "square", "abs", "sqrt", "linear", "bounded_relu", "soft_relu", "logistic",
|
||||
"exp", "gelu", "clamp", "swish", "prelu", "pooling", "norm", "gather", "memory" };
|
||||
|
||||
@@ -57,6 +57,14 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
|
||||
// zero and any negative value will be treated
|
||||
// as default batch size
|
||||
batchLimit = std::max(val_i, 0);
|
||||
} else if (key == PluginConfigParams::KEY_CPU_DYNAMIC_SEQUENCE) {
|
||||
int val_i = std::stoi(val);
|
||||
// zero and any negative value will be treated
|
||||
// as default sequence size, so no auto-reshaping will happen
|
||||
dynamicSequence = std::max(val_i, 0);
|
||||
} else if (key == PluginConfigParams::KEY_CPU_DYNAMIC_SEQUENCE_STEP) {
|
||||
int val_i = std::stoi(val);
|
||||
dynamicSequenceStep = std::max(val_i, 0);
|
||||
} else if (key == PluginConfigParams::KEY_PERF_COUNT) {
|
||||
if (val == PluginConfigParams::YES) collectPerfCounters = true;
|
||||
else if (val == PluginConfigParams::NO) collectPerfCounters = false;
|
||||
@@ -110,6 +118,15 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
|
||||
}
|
||||
if (exclusiveAsyncRequests) // Exclusive request feature disables the streams
|
||||
streamExecutorConfig._streams = 1;
|
||||
if (dynamicSequence && !dynamicSequenceStep) {
|
||||
THROW_IE_EXCEPTION << "Dynamic sequence recognition is enabled, but the "
|
||||
<< PluginConfigParams::KEY_CPU_DYNAMIC_SEQUENCE_STEP
|
||||
<< " is not set!";
|
||||
|
||||
} else if (!dynamicSequence && dynamicSequenceStep) {
|
||||
THROW_IE_EXCEPTION << "Dynamic sequence recognition " << PluginConfigParams::KEY_CPU_DYNAMIC_SEQUENCE
|
||||
<< " is not enabled while the " << PluginConfigParams::KEY_CPU_DYNAMIC_SEQUENCE_STEP << " is set!";
|
||||
}
|
||||
|
||||
updateProperties();
|
||||
}
|
||||
|
||||
@@ -21,6 +21,8 @@ struct Config {
|
||||
bool collectPerfCounters = false;
|
||||
bool exclusiveAsyncRequests = false;
|
||||
bool enableDynamicBatch = false;
|
||||
int dynamicSequence = 0;
|
||||
int dynamicSequenceStep = 0;
|
||||
std::string dumpToDot = "";
|
||||
std::string dumpQuantizedGraphToDot = "";
|
||||
std::string dumpQuantizedGraphToIr = "";
|
||||
|
||||
@@ -38,70 +38,76 @@ MKLDNNExecNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap network
|
||||
return std::make_shared<MKLDNNInferRequest>(networkInputs, networkOutputs, std::static_pointer_cast<MKLDNNExecNetwork>(shared_from_this()));
|
||||
}
|
||||
|
||||
MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network,
|
||||
MKLDNNExecNetwork::MKLDNNExecNetwork(ReshapedCNNNetworks networks,
|
||||
const Config &cfg,
|
||||
const MKLDNNExtensionManager::Ptr& extMgr,
|
||||
NumaNodesWeights &numaNodesWeights) :
|
||||
InferenceEngine::ExecutableNetworkThreadSafeDefault{nullptr, nullptr},
|
||||
extensionManager(extMgr),
|
||||
_cfg{cfg},
|
||||
_name{network.getName()} {
|
||||
_name{networks.begin()->second.getName()} {
|
||||
OV_ITT_SCOPED_TASK(itt::domains::MKLDNNPlugin, "MKLDNNExecNetwork::MKLDNNExecNetwork");
|
||||
|
||||
// we are cloning network if we have statistics and we can transform network.
|
||||
_clonedNetwork = cloneNet(network);
|
||||
typedef std::map<int, InferenceEngine::details::CNNNetworkImplPtr, sorting_order> PluginInternalNetworks;
|
||||
PluginInternalNetworks plugin_internal_networks;
|
||||
for (auto n : networks) {
|
||||
// we are cloning network if we have statistics and we can transform network.
|
||||
auto _clonedNetwork = cloneNet(n.second);
|
||||
|
||||
if (_cfg.lpTransformsMode == Config::LPTransformsMode::On) {
|
||||
auto params = LayerTransformation::Params(true, // updatePrecisions
|
||||
true, // quantizeOutputs
|
||||
true, // weightsToConst
|
||||
LayerTransformation::QuantizedTensorAlignment::UpdateLevel, // quantizedTensorAlignmentOnActivations
|
||||
LayerTransformation::QuantizedTensorAlignment::None, // quantizedTensorAlignmentOnWeights
|
||||
true, // roundQuantizedValues
|
||||
true, // updateBiases
|
||||
true); // supportAsymmetricQuantization
|
||||
LowPrecisionTransformer transformer(LowPrecisionTransformer::getAllTransformations(params).
|
||||
add<ConvolutionTransformation>(LayerTransformation::Params(params).setPrecisionsOnActivations({ Precision::U8 }), "Convolution").
|
||||
addCleanup<ScaleShiftToConvolutionTransformation>(
|
||||
LayerTransformation::Params(params).setPrecisionsOnActivations({ Precision::U8 }),
|
||||
"ScaleShift"));
|
||||
transformer.transform(*_clonedNetwork);
|
||||
if (_cfg.lpTransformsMode == Config::LPTransformsMode::On) {
|
||||
auto params = LayerTransformation::Params(true, // updatePrecisions
|
||||
true, // quantizeOutputs
|
||||
true, // weightsToConst
|
||||
LayerTransformation::QuantizedTensorAlignment::UpdateLevel, // quantizedTensorAlignmentOnActivations
|
||||
LayerTransformation::QuantizedTensorAlignment::None, // quantizedTensorAlignmentOnWeights
|
||||
true, // roundQuantizedValues
|
||||
true, // updateBiases
|
||||
true); // supportAsymmetricQuantization
|
||||
LowPrecisionTransformer transformer(LowPrecisionTransformer::getAllTransformations(params).
|
||||
add<ConvolutionTransformation>(
|
||||
LayerTransformation::Params(params).setPrecisionsOnActivations({Precision::U8}), "Convolution").
|
||||
addCleanup<ScaleShiftToConvolutionTransformation>(
|
||||
LayerTransformation::Params(params).setPrecisionsOnActivations({Precision::U8}),
|
||||
"ScaleShift"));
|
||||
transformer.transform(*_clonedNetwork);
|
||||
|
||||
// Check if network is INT8 or Binary.
|
||||
// BF16 transformations were disabled since CPU plug-in doesn't support mixed precision execution:
|
||||
// BF16 + INT8 or BF16 + BIN.
|
||||
bool isFloatModel = true;
|
||||
CNNNetworkIterator i(&network);
|
||||
while (i != CNNNetworkIterator()) {
|
||||
if (CaselessEq<std::string>()((*i)->type, "FakeQuantize")) {
|
||||
isFloatModel = false;
|
||||
break;
|
||||
// Check if network is INT8 or Binary.
|
||||
// BF16 transformations were disabled since CPU plug-in doesn't support mixed precision execution:
|
||||
// BF16 + INT8 or BF16 + BIN.
|
||||
bool isFloatModel = true;
|
||||
CNNNetworkIterator i(&n.second.operator InferenceEngine::ICNNNetwork &());
|
||||
while (i != CNNNetworkIterator()) {
|
||||
if (CaselessEq<std::string>()((*i)->type, "FakeQuantize")) {
|
||||
isFloatModel = false;
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
if (with_cpu_x86_bfloat16() && isFloatModel) {
|
||||
BF16Transformer bf16Transformer;
|
||||
CNNNetwork cnnetwork(_clonedNetwork);
|
||||
// If enforceBF16 flag was set, BF16 transformation applies for all layers supported by CPU plugin.
|
||||
// Otherwise, only layers marked as BF16 in 'cnnetwork' will be executed in bfloat16 mode.
|
||||
// The CPU plugin throws an exception if layers marked as BF16 are not supported by the CPU plugin.
|
||||
if (cfg.enforceBF16 == true)
|
||||
bf16Transformer.convertToBFloat16(cnnetwork);
|
||||
} else {
|
||||
BF16Transformer bf16Transformer;
|
||||
CNNNetwork cnnetwork(_clonedNetwork);
|
||||
bf16Transformer.convertToFloat(cnnetwork);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
if (with_cpu_x86_bfloat16() && isFloatModel) {
|
||||
BF16Transformer bf16Transformer;
|
||||
CNNNetwork cnnetwork(_clonedNetwork);
|
||||
// If enforceBF16 flag was set, BF16 transformation applies for all layers supported by CPU plugin.
|
||||
// Otherwise, only layers marked as BF16 in 'cnnetwork' will be executed in bfloat16 mode.
|
||||
// The CPU plugin throws an exception if layers marked as BF16 are not supported by the CPU plugin.
|
||||
if (cfg.enforceBF16 == true)
|
||||
bf16Transformer.convertToBFloat16(cnnetwork);
|
||||
} else {
|
||||
BF16Transformer bf16Transformer;
|
||||
CNNNetwork cnnetwork(_clonedNetwork);
|
||||
bf16Transformer.convertToFloat(cnnetwork);
|
||||
}
|
||||
}
|
||||
|
||||
MKLDNNGraph::ApplyUnrollPasses(static_cast<ICNNNetwork&>(*_clonedNetwork));
|
||||
MKLDNNGraph::ApplyUnrollPasses(static_cast<ICNNNetwork &>(*_clonedNetwork));
|
||||
|
||||
if (_cfg.enableDynamicBatch) {
|
||||
// check topology for applicability
|
||||
if (!CanProcessDynBatch(*_clonedNetwork)) {
|
||||
THROW_IE_EXCEPTION << "MKLDNNGraph::CreateGraph: such topology cannot be compiled for dynamic batch!";
|
||||
// check topology for applicability
|
||||
if (!CanProcessDynBatch(*_clonedNetwork)) {
|
||||
THROW_IE_EXCEPTION << "MKLDNNGraph::CreateGraph: such topology cannot be compiled for dynamic batch!";
|
||||
}
|
||||
}
|
||||
plugin_internal_networks[n.first] = _clonedNetwork;
|
||||
}
|
||||
|
||||
if (cfg.exclusiveAsyncRequests) {
|
||||
@@ -131,19 +137,23 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network
|
||||
_graphs = decltype(_graphs){[&] {
|
||||
// TODO: Remove `cloneNet` to `localNetwork` when `MKLDNNGraph::CreateGraph`
|
||||
// is fixed and does not change content of network passed (CVS-26420)
|
||||
auto localNetwork = cloneNet(static_cast<ICNNNetwork&>(*_clonedNetwork));
|
||||
auto graph = std::make_shared<MKLDNNGraph>();
|
||||
{
|
||||
std::unique_lock<std::mutex> lock{_cfgMutex};
|
||||
graph->setConfig(_cfg);
|
||||
SequenceGraphs m;
|
||||
for (auto n : plugin_internal_networks) {
|
||||
auto localNetwork = cloneNet(static_cast<ICNNNetwork&>(*n.second.get()));
|
||||
auto graph = std::make_shared<MKLDNNGraph>();
|
||||
{
|
||||
std::unique_lock<std::mutex> lock{_cfgMutex};
|
||||
graph->setConfig(_cfg);
|
||||
}
|
||||
int numaNode = 0;
|
||||
auto *streamExecutor = dynamic_cast<InferenceEngine::IStreamsExecutor *>(_taskExecutor.get());
|
||||
if (nullptr != streamExecutor) {
|
||||
numaNode = streamExecutor->GetNumaNodeId();
|
||||
}
|
||||
graph->CreateGraph(static_cast<ICNNNetwork &>(*localNetwork), extensionManager, numaNodesWeights[numaNode]);
|
||||
m[n.first] = graph;
|
||||
}
|
||||
int numaNode = 0;
|
||||
auto* streamExecutor = dynamic_cast<InferenceEngine::IStreamsExecutor*>(_taskExecutor.get());
|
||||
if (nullptr != streamExecutor) {
|
||||
numaNode = streamExecutor->GetNumaNodeId();
|
||||
}
|
||||
graph->CreateGraph(static_cast<ICNNNetwork&>(*localNetwork), extensionManager, numaNodesWeights[numaNode]);
|
||||
return graph;
|
||||
return m;
|
||||
}};
|
||||
|
||||
_taskExecutor->runAndWait({std::thread::hardware_concurrency(), [this] {_graphs.local();}});
|
||||
@@ -152,7 +162,7 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network
|
||||
// of MemoryLayer implementation. It uses output edge of MemoryLayer
|
||||
// producer as storage for tensor to keep it between infer calls.
|
||||
if (_graphs.size() == 1) {
|
||||
for (auto &node : _graphs.begin()->get()->GetNodes()) {
|
||||
for (auto &node : _graphs.begin()->begin()->second->GetNodes()) {
|
||||
if (node->getType() == MemoryInput) {
|
||||
auto memoryNode = dynamic_cast<MKLDNNMemoryInputNode*>(node.get());
|
||||
auto state_store = memoryNode->getStore();
|
||||
@@ -174,9 +184,9 @@ void MKLDNNExecNetwork::setProperty(const std::map<std::string, std::string> &pr
|
||||
std::lock_guard<std::mutex> lock{_cfgMutex};
|
||||
_cfg.readProperties(properties);
|
||||
}
|
||||
for (auto g : _graphs) {
|
||||
g->setProperty(properties);
|
||||
}
|
||||
for (auto g : _graphs)
|
||||
for (auto s : g)
|
||||
s.second->setProperty(properties);
|
||||
}
|
||||
|
||||
void MKLDNNExecNetwork::CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) {
|
||||
@@ -193,13 +203,13 @@ void MKLDNNExecNetwork::GetExecGraphInfo(InferenceEngine::ICNNNetwork::Ptr &grap
|
||||
if (_graphs.size() == 0)
|
||||
THROW_IE_EXCEPTION << "No graph was found";
|
||||
|
||||
graphPtr = _graphs.begin()->get()->dump();
|
||||
graphPtr = _graphs.begin()->begin()->second->dump();
|
||||
}
|
||||
|
||||
void MKLDNNExecNetwork::GetConfig(const std::string &name, Parameter &result, ResponseDesc *resp) const {
|
||||
if (_graphs.size() == 0)
|
||||
THROW_IE_EXCEPTION << "No graph was found";
|
||||
Config engConfig = _graphs.begin()->get()->getProperty();
|
||||
Config engConfig = _graphs.begin()->begin()->second->getProperty();
|
||||
auto option = engConfig._config.find(name);
|
||||
if (option != engConfig._config.end()) {
|
||||
result = option->second;
|
||||
@@ -213,9 +223,9 @@ void MKLDNNExecNetwork::GetMetric(const std::string &name, Parameter &result, Re
|
||||
THROW_IE_EXCEPTION << "No graph was found";
|
||||
|
||||
if (name == METRIC_KEY(NETWORK_NAME)) {
|
||||
if (_graphs.begin()->get()->dump() == nullptr)
|
||||
if (_graphs.begin()->begin()->second->dump() == nullptr)
|
||||
THROW_IE_EXCEPTION << "Invalid graph dump";
|
||||
result = IE_SET_METRIC(NETWORK_NAME, _graphs.begin()->get()->dump()->getName());
|
||||
result = IE_SET_METRIC(NETWORK_NAME, _graphs.begin()->begin()->second->dump()->getName());
|
||||
} else if (name == METRIC_KEY(SUPPORTED_METRICS)) {
|
||||
std::vector<std::string> metrics;
|
||||
metrics.push_back(METRIC_KEY(NETWORK_NAME));
|
||||
@@ -225,12 +235,12 @@ void MKLDNNExecNetwork::GetMetric(const std::string &name, Parameter &result, Re
|
||||
result = IE_SET_METRIC(SUPPORTED_METRICS, metrics);
|
||||
} else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) {
|
||||
std::vector<std::string> configKeys;
|
||||
for (auto && key : _graphs.begin()->get()->getProperty()._config) {
|
||||
for (auto && key : _graphs.begin()->begin()->second->getProperty()._config) {
|
||||
configKeys.push_back(key.first);
|
||||
}
|
||||
result = IE_SET_METRIC(SUPPORTED_CONFIG_KEYS, configKeys);
|
||||
} else if (name == METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)) {
|
||||
Config engConfig = _graphs.begin()->get()->getProperty();
|
||||
Config engConfig = _graphs.begin()->begin()->second->getProperty();
|
||||
auto option = engConfig._config.find(CONFIG_KEY(CPU_THROUGHPUT_STREAMS));
|
||||
IE_ASSERT(option != engConfig._config.end());
|
||||
auto streams = std::stoi(option->second);
|
||||
|
||||
@@ -18,6 +18,9 @@
|
||||
#include <unordered_map>
|
||||
|
||||
namespace MKLDNNPlugin {
|
||||
typedef std::less<int> sorting_order;
|
||||
typedef std::map<int, MKLDNNGraph::Ptr, sorting_order> SequenceGraphs;
|
||||
typedef std::map<int, InferenceEngine::CNNNetwork, sorting_order> ReshapedCNNNetworks;
|
||||
|
||||
class MKLDNNExecNetwork: public InferenceEngine::ExecutableNetworkThreadSafeDefault {
|
||||
public:
|
||||
@@ -29,7 +32,7 @@ public:
|
||||
|
||||
void CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) override;
|
||||
|
||||
MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network, const Config &cfg,
|
||||
MKLDNNExecNetwork(ReshapedCNNNetworks, const Config &cfg,
|
||||
const MKLDNNExtensionManager::Ptr &extMgr, NumaNodesWeights &weightsSharing);
|
||||
|
||||
~MKLDNNExecNetwork() override = default;
|
||||
@@ -44,13 +47,12 @@ public:
|
||||
|
||||
std::vector<InferenceEngine::IMemoryStateInternal::Ptr> QueryState() override;
|
||||
|
||||
InferenceEngine::ThreadLocal<MKLDNNGraph::Ptr> _graphs;
|
||||
InferenceEngine::ThreadLocal<SequenceGraphs> _graphs;
|
||||
|
||||
protected:
|
||||
friend class MKLDNNInferRequest;
|
||||
MKLDNNExtensionManager::Ptr extensionManager;
|
||||
std::vector<InferenceEngine::IMemoryStateInternal::Ptr> memoryStates;
|
||||
InferenceEngine::details::CNNNetworkImplPtr _clonedNetwork;
|
||||
std::mutex _cfgMutex;
|
||||
Config _cfg;
|
||||
std::atomic_int _numRequests = {0};
|
||||
|
||||
@@ -758,9 +758,14 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
|
||||
ext_blob->allocate();
|
||||
}
|
||||
|
||||
if (ext_blob->byteSize() != intr_blob.GetSize())
|
||||
if (config.dynamicSequence) {
|
||||
if (ext_blob->byteSize() < intr_blob.GetSize())
|
||||
THROW_IE_EXCEPTION << "Output blob size is less than network output size ("
|
||||
<< ext_blob->size() << "<" << intr_blob.GetSize() / sizeof(float) << ").";
|
||||
} else if (ext_blob->byteSize() != intr_blob.GetSize()) {
|
||||
THROW_IE_EXCEPTION << "Output blob size is not equal network output size ("
|
||||
<< ext_blob->size() << "!=" << intr_blob.GetSize()/sizeof(float) << ").";
|
||||
<< ext_blob->size() << "!=" << intr_blob.GetSize() / sizeof(float) << ").";
|
||||
}
|
||||
|
||||
void *ext_blob_ptr = ext_blob->buffer();
|
||||
void *intr_blob_ptr = intr_blob.GetData();
|
||||
@@ -776,6 +781,13 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
|
||||
size_t size_to_copy = intr_blob.GetSize() * MB_to_process / MB;
|
||||
|
||||
ie_memcpy(ext_blob_ptr, ext_blob->byteSize(), intr_blob_ptr, size_to_copy);
|
||||
if (config.dynamicSequence && ext_blob->size() > intr_blob.GetElementsCount()) {
|
||||
if (ext_blob->getTensorDesc().getPrecision() != InferenceEngine::Precision::FP32)
|
||||
THROW_IE_EXCEPTION << "Dynamic sequence is supported only for the fp32 outputs only!";
|
||||
auto elements = intr_blob.GetElementsCount();
|
||||
std::fill(static_cast<float*>(ext_blob_ptr) + elements,
|
||||
static_cast<float*>(ext_blob_ptr) + ext_blob->size(), -std::numeric_limits<float>::max());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -24,7 +24,10 @@ MKLDNNPlugin::MKLDNNInferRequest::MKLDNNInferRequest(InferenceEngine::InputsData
|
||||
|
||||
if (execNetwork->_graphs.size() == 0)
|
||||
THROW_IE_EXCEPTION << "No graph was found";
|
||||
graph = execNetwork->_graphs.begin()->get();
|
||||
const int seq = execNetwork->_graphs.begin()->size() > 1
|
||||
? _networkInputs.cbegin()->second->getTensorDesc().getDims()[1]
|
||||
: 0;
|
||||
graph = execNetwork->_graphs.begin()->at(seq).get();
|
||||
for (const auto& it : _networkInputs) {
|
||||
InferenceEngine::Blob::Ptr blob;
|
||||
MKLDNNInferRequest::GetBlob(it.first.c_str(), blob);
|
||||
@@ -79,13 +82,28 @@ void copyToFloat(float* dst, const InferenceEngine::Blob* src) {
|
||||
|
||||
void MKLDNNPlugin::MKLDNNInferRequest::InferImpl() {
|
||||
using namespace openvino::itt;
|
||||
const bool dyn_sequence = execNetwork->_graphs.local().size() > 1;
|
||||
auto dims = _inputs.cbegin()->second->getTensorDesc().getDims();
|
||||
OV_ITT_SCOPED_TASK(itt::domains::MKLDNNPlugin, profilingTask);
|
||||
|
||||
graph = execNetwork->_graphs.local().get();
|
||||
if (dyn_sequence) {
|
||||
// graph per sequence
|
||||
const int *ptr = _inputs.cbegin()->second->buffer().as<int *>();
|
||||
auto sz = _inputs.cbegin()->second->size();
|
||||
const int size_non_zero = std::distance(ptr,
|
||||
std::find_if(ptr, ptr + sz, [](int x) { return x == 0; }));
|
||||
const int actual_seq = execNetwork->_graphs.local().lower_bound(size_non_zero)->first;
|
||||
// std::cout << "Last non-zero : " << size_non_zero << ", Actual Seq : " << actual_seq << std::endl;
|
||||
graph = execNetwork->_graphs.local()[actual_seq].get();
|
||||
dims[1] = actual_seq;
|
||||
} else {
|
||||
graph = execNetwork->_graphs.local().begin()->second.get();
|
||||
}
|
||||
|
||||
{
|
||||
execDataPreprocessing(_inputs);
|
||||
|
||||
changeDefaultPtr();
|
||||
if (!dyn_sequence)
|
||||
changeDefaultPtr();
|
||||
|
||||
// need to retain converted blobs until infer finish
|
||||
std::vector<InferenceEngine::Blob::Ptr> convertedInputs;
|
||||
@@ -103,7 +121,16 @@ void MKLDNNPlugin::MKLDNNInferRequest::InferImpl() {
|
||||
pushInput<float>(input.first, input.second);
|
||||
break;
|
||||
case InferenceEngine::Precision::I32:
|
||||
pushInput<int32_t>(input.first, input.second);
|
||||
if (dyn_sequence) {
|
||||
iconv = InferenceEngine::make_shared_blob<int32_t>({InferenceEngine::Precision::I32,
|
||||
dims,
|
||||
input.second->getTensorDesc().getLayout()},
|
||||
input.second->buffer());
|
||||
convertedInputs.push_back(iconv);
|
||||
pushInput<int32_t>(input.first, iconv);
|
||||
} else {
|
||||
pushInput<int32_t>(input.first, input.second);
|
||||
}
|
||||
break;
|
||||
case InferenceEngine::Precision::I8:
|
||||
pushInput<int8_t>(input.first, input.second);
|
||||
|
||||
@@ -158,28 +158,51 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::ICNNNetwork &network, const st
|
||||
conf.batchLimit = static_cast<int>(network.getBatchSize());
|
||||
}
|
||||
|
||||
std::shared_ptr<ICNNNetwork> clonedNetwork = cloneNetwork(network);
|
||||
bool is_transformed = false;
|
||||
if (clonedNetwork->getFunction()) {
|
||||
Transformation(clonedNetwork);
|
||||
is_transformed = true;
|
||||
CNNNetwork localNetwork(cloneNetwork(network));
|
||||
const InputsDataMap inputInfo = localNetwork.getInputsInfo();
|
||||
ICNNNetwork::InputShapes shapes = localNetwork.getInputShapes();
|
||||
ReshapedCNNNetworks reshapedNetworks;
|
||||
int seq = 0;
|
||||
if (conf.dynamicSequence) {
|
||||
if (shapes.at(inputInfo.cbegin()->first).size() < 2)
|
||||
THROW_IE_EXCEPTION << "Auto-reshaping of the network with no sequence (first input is scalar or channels-only)!";
|
||||
seq = shapes.at(inputInfo.cbegin()->first)[1];
|
||||
}
|
||||
auto implNetwork = std::dynamic_pointer_cast<details::CNNNetworkImpl>(clonedNetwork);
|
||||
if (implNetwork) {
|
||||
// valid for CNNNetworkImpl only, while there's no API in ICNNNetwork to change network
|
||||
ConstTransformer transformator(implNetwork.get());
|
||||
transformator.fullTrim();
|
||||
if (!is_transformed) {
|
||||
NetPass::ConvertPrecision(*implNetwork, Precision::I64, Precision::I32);
|
||||
NetPass::ConvertPrecision(*implNetwork, Precision::U64, Precision::I32);
|
||||
NetPass::ConvertPrecision(*implNetwork, Precision::U32, Precision::I32);
|
||||
NetPass::ConvertPrecision(*implNetwork, Precision::FP16, Precision::FP32);
|
||||
NetPass::ConvertPrecision(*implNetwork, Precision::BOOL, Precision::U8);
|
||||
NetPass::ConvertPrecision(*implNetwork, Precision::U16, Precision::I32);
|
||||
do {
|
||||
CNNNetwork clonedNetwork(cloneNetwork(network));
|
||||
if (conf.dynamicSequence) {
|
||||
for (const InputsDataMap::value_type &item : inputInfo)
|
||||
shapes[item.first][1] = seq;
|
||||
// std::cout << "Reshaped network by sequence to " << seq << std::endl;
|
||||
clonedNetwork.reshape(shapes);
|
||||
}
|
||||
}
|
||||
bool is_transformed = false;
|
||||
if (clonedNetwork.getFunction()) {
|
||||
auto temp = clonedNetwork.operator ICNNNetwork::Ptr();
|
||||
Transformation(temp);
|
||||
clonedNetwork = CNNNetwork(temp);
|
||||
is_transformed = true;
|
||||
}
|
||||
auto implNetwork = std::dynamic_pointer_cast<details::CNNNetworkImpl>(
|
||||
clonedNetwork.operator ICNNNetwork::Ptr());
|
||||
if (implNetwork) {
|
||||
// valid for CNNNetworkImpl only, while there's no API in ICNNNetwork to change network
|
||||
ConstTransformer transformator(implNetwork.get());
|
||||
transformator.fullTrim();
|
||||
if (!is_transformed) {
|
||||
NetPass::ConvertPrecision(*implNetwork, Precision::I64, Precision::I32);
|
||||
NetPass::ConvertPrecision(*implNetwork, Precision::U64, Precision::I32);
|
||||
NetPass::ConvertPrecision(*implNetwork, Precision::U32, Precision::I32);
|
||||
NetPass::ConvertPrecision(*implNetwork, Precision::FP16, Precision::FP32);
|
||||
NetPass::ConvertPrecision(*implNetwork, Precision::BOOL, Precision::U8);
|
||||
NetPass::ConvertPrecision(*implNetwork, Precision::U16, Precision::I32);
|
||||
}
|
||||
}
|
||||
reshapedNetworks[seq] = clonedNetwork;
|
||||
seq -= conf.dynamicSequenceStep;
|
||||
} while (conf.dynamicSequence && seq >= conf.dynamicSequence);
|
||||
|
||||
return std::make_shared<MKLDNNExecNetwork>(*clonedNetwork, conf, extensionManager, weightsSharing);
|
||||
return std::make_shared<MKLDNNExecNetwork>(reshapedNetworks, conf, extensionManager, weightsSharing);
|
||||
}
|
||||
|
||||
void Engine::SetConfig(const std::map<std::string, std::string> &config) {
|
||||
|
||||
@@ -122,8 +122,13 @@ void MKLDNNGemmNode::initSupportedPrimitiveDescriptors() {
|
||||
auto inPrec0 = getCnnLayer()->insData[0].lock()->getPrecision();
|
||||
auto inPrec1 = getCnnLayer()->insData[1].lock()->getPrecision();
|
||||
if ((inPrec0 != Precision::U8 && inPrec0 != Precision::I8) || inPrec1 != Precision::I8 || isThreeInputs) {
|
||||
inPrec0 = Precision::FP32;
|
||||
inPrec1 = Precision::FP32;
|
||||
if (inPrec0 == Precision::BF16 || inPrec1 == Precision::BF16) {
|
||||
inPrec0 = Precision::BF16;
|
||||
inPrec1 = Precision::BF16;
|
||||
} else {
|
||||
inPrec0 = Precision::FP32;
|
||||
inPrec1 = Precision::FP32;
|
||||
}
|
||||
}
|
||||
|
||||
auto inputDataType0 = MKLDNNExtensionUtils::IEPrecisionToDataType(inPrec0);
|
||||
@@ -192,6 +197,11 @@ inline void process_gemm(char transa, char transb, int M, int N, int K, float al
|
||||
mkldnn_sgemm(transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
|
||||
}
|
||||
|
||||
inline void process_gemm(char transa, char transb, int M, int N, int K, float alpha, const uint16_t *A, int lda,
|
||||
const uint16_t *B, int ldb, float beta, float *C, int ldc) {
|
||||
mkldnn_gemm_bf16bf16f32(transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
|
||||
}
|
||||
|
||||
inline void process_gemm(char transa, char transb, int M, int N, int K, float alpha, const uint8_t *A, int lda,
|
||||
const int8_t *B, int ldb, float beta, float *C, int ldc) {
|
||||
const int32_t co = 0;
|
||||
@@ -288,6 +298,9 @@ void MKLDNNGemmNode::execute(mkldnn::stream strm) {
|
||||
case Precision::FP32:
|
||||
process_data<float, float>();
|
||||
break;
|
||||
case Precision::BF16:
|
||||
process_data<uint16_t, uint16_t>();
|
||||
break;
|
||||
case Precision::I8:
|
||||
process_data<int8_t, int8_t>();
|
||||
break;
|
||||
|
||||
@@ -16,7 +16,7 @@ using namespace mkldnn;
|
||||
class MKLDNNTestExecNetwork: public MKLDNNPlugin::MKLDNNExecNetwork {
|
||||
public:
|
||||
MKLDNNPlugin::MKLDNNGraph& getGraph() {
|
||||
return *(_graphs.begin()->get());
|
||||
return *(_graphs.begin()->begin()->second);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1198,7 +1198,7 @@ TEST_F(MKLDNNGraphStructureTests, TestOutputAfterInplacePlusConcat) {
|
||||
InferenceEngine::Core core;
|
||||
InferenceEngine::CNNNetwork network;
|
||||
ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
|
||||
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
|
||||
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
|
||||
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
|
||||
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
|
||||
execNetwork->setNetworkInputs(_networkInputs);
|
||||
@@ -1714,7 +1714,7 @@ TEST_F(MKLDNNGraphStructureTests, TestResnetPart) {
|
||||
InferenceEngine::CNNNetwork network;
|
||||
ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
|
||||
|
||||
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
|
||||
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
|
||||
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
|
||||
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
|
||||
execNetwork->setNetworkInputs(_networkInputs);
|
||||
@@ -1864,7 +1864,7 @@ TEST_F(MKLDNNGraphStructureTests, TestConcatAfterConcat) {
|
||||
InferenceEngine::Core core;
|
||||
InferenceEngine::CNNNetwork network;
|
||||
ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
|
||||
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
|
||||
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
|
||||
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
|
||||
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
|
||||
execNetwork->setNetworkInputs(_networkInputs);
|
||||
@@ -2045,7 +2045,7 @@ TEST_F(MKLDNNGraphStructureTests, Test2ConcatFromConcat) {
|
||||
InferenceEngine::Core core;
|
||||
InferenceEngine::CNNNetwork network;
|
||||
ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
|
||||
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
|
||||
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
|
||||
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
|
||||
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
|
||||
execNetwork->setNetworkInputs(_networkInputs);
|
||||
@@ -2377,7 +2377,7 @@ TEST_F(MKLDNNGraphStructureTests, TestLoadTopologyWithConstLayer) {
|
||||
InferenceEngine::CNNNetwork network;
|
||||
ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
|
||||
|
||||
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
|
||||
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
|
||||
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
|
||||
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
|
||||
execNetwork->setNetworkInputs(_networkInputs);
|
||||
@@ -2525,7 +2525,7 @@ TEST_F(MKLDNNGraphStructureTests, TestLoadTopologyWithEltwiseBeforeConcat) {
|
||||
InferenceEngine::CNNNetwork network;
|
||||
ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
|
||||
|
||||
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
|
||||
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
|
||||
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
|
||||
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
|
||||
execNetwork->setNetworkInputs(_networkInputs);
|
||||
|
||||
@@ -76,6 +76,7 @@ JitConstants SoftmaxKerneItemsClassOptimized::GetJitConstants(const softmax_para
|
||||
auto jit = SoftmaxItemsClassKernelBase::GetJitConstants(params, kd);
|
||||
|
||||
jit.AddConstant(MakeJitConstant("WORKITEMS_PER_CLASSES", workitems_per_classes));
|
||||
jit.AddConstant(MakeJitConstant("HAS_DRIVER_PROBLEMS", params.engineInfo.bIMADSupport));
|
||||
|
||||
return jit;
|
||||
}
|
||||
|
||||
@@ -63,12 +63,24 @@ KERNEL(softmax_items_class_optimized)(__global INPUT0_TYPE* input, __global OUTP
|
||||
ACCUMULATOR_TYPE denominator = 0.0;
|
||||
for (uint cls = 0; cls < FULL_ITERATIONS_NUM; cls++)
|
||||
{
|
||||
// This is a temporary solution for unresolved problem when ocl kernels compilation step doesn't produce actual binaries
|
||||
// for current kernel but driver doesn't report any errors (JIRA CVS-32211)
|
||||
#if HAS_DRIVER_PROBLEMS
|
||||
data[cls] = data[cls] == max_value ? 1.0 : native_exp(data[cls] - max_value);
|
||||
#else
|
||||
data[cls] = native_exp(data[cls] - max_value);
|
||||
#endif
|
||||
denominator += data[cls];
|
||||
}
|
||||
if(simd_lane < LEFTOVERS)
|
||||
{
|
||||
// This is a temporary solution for unresolved problem when ocl kernels compilation step doesn't produce actual binaries
|
||||
// for current kernel but driver doesn't report any errors (JIRA CVS-32211)
|
||||
#if HAS_DRIVER_PROBLEMS
|
||||
data[DATA_PER_WORKITEM-1] = data[DATA_PER_WORKITEM-1] == max_value ? 1.0 : native_exp(data[DATA_PER_WORKITEM-1] - max_value);
|
||||
#else
|
||||
data[DATA_PER_WORKITEM-1] = native_exp(data[DATA_PER_WORKITEM-1] - max_value);
|
||||
#endif
|
||||
denominator += data[DATA_PER_WORKITEM-1];
|
||||
}
|
||||
|
||||
|
||||
2
inference-engine/thirdparty/mkl-dnn
vendored
2
inference-engine/thirdparty/mkl-dnn
vendored
Submodule inference-engine/thirdparty/mkl-dnn updated: 4f511de56e...b96a54762a
@@ -119,6 +119,7 @@ extensions/front/create_tensor_nodes.py
|
||||
extensions/front/disable_weights_quantize_value_propagation.py
|
||||
extensions/front/div.py
|
||||
extensions/front/eltwise_n.py
|
||||
extensions/front/EmbeddingBagFuse.py
|
||||
extensions/front/ExpandDimsToUnsqueeze.py
|
||||
extensions/front/FillToBroadcast.py
|
||||
extensions/front/flatten_to_reshape.py
|
||||
@@ -286,6 +287,7 @@ extensions/front/onnx/priorbox_ext.py
|
||||
extensions/front/onnx/priorgridgenerator_ext.py
|
||||
extensions/front/onnx/proposal_ext.py
|
||||
extensions/front/onnx/quantize_dequantize_linear.py
|
||||
extensions/front/onnx/quantize_dequantize_redundant.py
|
||||
extensions/front/onnx/quantize_ext.py
|
||||
extensions/front/onnx/quantize_linear_ext.py
|
||||
extensions/front/onnx/quantize_linear_resolver.py
|
||||
|
||||
91
model-optimizer/extensions/front/EmbeddingBagFuse.py
Normal file
91
model-optimizer/extensions/front/EmbeddingBagFuse.py
Normal file
@@ -0,0 +1,91 @@
|
||||
"""
|
||||
Copyright (C) 2020 Intel Corporation
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
"""
|
||||
|
||||
from extensions.ops.embedding_bag import EmbeddingBagOffsetsSum
|
||||
from mo.front.common.replacement import FrontReplacementSubgraph
|
||||
from mo.graph.graph import Graph, rename_nodes
|
||||
|
||||
|
||||
class EmbeddingBagFuse(FrontReplacementSubgraph):
    """
    Replaces an unrolled embedding-bag sub-graph with a single EmbeddingBagOffsetsSum node.

    The matched pattern is: two Gathers reading from a common Concat, each followed by an Unsqueeze that feeds
    inputs 1 and 2 of a Slice; the Slice output is input 1 of a Gather whose input 0 is a Const ('weights');
    that Gather is followed by ReduceSum -> Unsqueeze -> Concat.
    """
    enabled = True

    def run_after(self):
        from extensions.front.ExpandDimsToUnsqueeze import ExpandDimsToUnsqueeze
        from extensions.front.AttributedGatherNormalizer import AttributedGatherNormalizer
        return [ExpandDimsToUnsqueeze, AttributedGatherNormalizer]

    def pattern(self):
        return dict(
            nodes=[
                ('weights', dict(op='Const')),
                ('concat_before', dict(op='Concat')),
                ('gather_before1_1', dict(op='Gather')),
                ('unsqueeze_before1_1', dict(op='Unsqueeze')),
                ('gather_before2_1', dict(op='Gather')),
                ('unsqueeze_before2_1', dict(op='Unsqueeze')),
                ('slice1', dict(op='Slice')),
                ('gather_after1', dict(op='Gather')),
                ('reduce1', dict(op='ReduceSum')),
                ('unsqueeze_after1', dict(op='Unsqueeze')),
                ('concat_after', dict(op='Concat')),
            ],
            edges=[
                ('concat_before', 'gather_before1_1'),
                ('concat_before', 'gather_before2_1'),
                ('gather_before1_1', 'unsqueeze_before1_1'),
                ('gather_before2_1', 'unsqueeze_before2_1'),
                ('unsqueeze_before1_1', 'slice1', {'out': 0, 'in': 1}),
                ('unsqueeze_before2_1', 'slice1', {'out': 0, 'in': 2}),
                ('weights', 'gather_after1', {'out': 0, 'in': 0}),
                ('slice1', 'gather_after1', {'out': 0, 'in': 1}),
                ('gather_after1', 'reduce1'),
                ('reduce1', 'unsqueeze_after1'),
                ('unsqueeze_after1', 'concat_after'),
            ])

    def replace_sub_graph(self, graph: Graph, match: dict):
        """
        Rewires the matched sub-graph onto a new EmbeddingBagOffsetsSum node.

        The replacement is skipped (the graph is left untouched) when the Gather consumers of the weights do not
        all use the same axis input value, or the Slice consumers of the indices do not all use the same axis
        input value.

        :param graph: graph being transformed
        :param match: mapping from pattern node names to the matched nodes
        """
        concat_before = match['concat_before']
        gather_after1 = match['gather_after1']
        slice1 = match['slice1']
        concat_after = match['concat_after']

        # Collect every Gather fed by the weights and check that they agree on the axis (input 2).
        weights_node = gather_after1.in_port(0).get_source().node
        gather_after_axis = gather_after1.in_port(2).get_source().node.soft_get('value')
        gather_ports = [port for port in weights_node.out_port(0).get_destinations() if port.node.op == 'Gather']
        for port in gather_ports:
            if port.node.in_port(2).get_source().node.soft_get('value') != gather_after_axis:
                return

        # Collect every Slice fed by the indices and check that they agree on the axis (input 3).
        indices_node = slice1.in_port(0).get_source().node
        slice_axis = slice1.in_port(3).get_source().node.soft_get('value')
        slice_ports = [port for port in indices_node.out_port(0).get_destinations() if port.node.op == 'Slice']
        for port in slice_ports:
            if port.node.in_port(3).get_source().node.soft_get('value') != slice_axis:
                return

        # Mutate only after all validation has passed, so that bailing out never leaves the graph
        # with some of the consumers already detached.
        for port in gather_ports + slice_ports:
            port.disconnect()

        emb_bag = EmbeddingBagOffsetsSum(graph, {}).create_node()
        weights_node.out_port(0).connect(emb_bag.in_port(0))
        indices_node.out_port(0).connect(emb_bag.in_port(1))
        concat_before.in_port(0).get_connection().set_destination(emb_bag.in_port(2))
        concat_after.out_port(0).get_connection().set_source(emb_bag.out_port(0))

        # The new node takes over the name of the final Concat.
        concat_name = concat_after.soft_get('name', concat_after.id)
        rename_nodes([(concat_after, concat_name + '/TBD'), (emb_bag, concat_name)])

        # remove this sub-graph since a lot of matchings will be obsolete
        graph.remove_nodes_from(graph.dfs(concat_before.id, set()))
|
||||
@@ -0,0 +1,171 @@
|
||||
"""
|
||||
Copyright (C) 2018-2020 Intel Corporation
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
"""
|
||||
|
||||
import logging as log
|
||||
|
||||
from mo.front.common.replacement import FrontReplacementSubgraph
|
||||
from mo.front.subgraph_matcher import SubgraphMatch
|
||||
from mo.graph.graph import Graph
|
||||
|
||||
|
||||
class QuantizeDequantizeRedundant2(FrontReplacementSubgraph):
    """
    Fuses duplicated QuantizeLinear and DequantizeLinear nodes
    (redundancy in the official NV's int8 MLPerf BERT model)
    Covers cases when the values for zero point and scale are same in both QuantizeLinear and DequantizeLinear.

    The pattern is an Add whose output feeds two QuantizeLinear -> DequantizeLinear branches; when both
    QuantizeLinear nodes use the same constant scale and zero point, the second branch is removed and its
    consumers are re-attached to the first DequantizeLinear.
    """
    enabled = True

    def run_before(self):
        from extensions.front.onnx.quantize_dequantize_linear import QuantizeDequantizeLinear
        return [QuantizeDequantizeLinear]

    def pattern(self):
        return dict(
            nodes=[
                ('inp', dict(op='Add')),
                ('quantize0', dict(op='QuantizeLinear')),
                ('dequantize0', dict(op='DequantizeLinear')),
                ('quantize1', dict(op='QuantizeLinear')),
                ('dequantize1', dict(op='DequantizeLinear')),
            ],
            edges=[
                ('inp', 'quantize0', {'in': 0}),
                ('inp', 'quantize1', {'in': 0}),
                ('quantize0', 'dequantize0', {'in': 0}),
                ('quantize1', 'dequantize1', {'in': 0}),
            ]
        )

    def replace_sub_graph(self, graph: Graph, match: [dict, SubgraphMatch]):
        """
        Removes the second Q-DQ branch when it duplicates the first one.

        :param graph: graph being transformed
        :param match: mapping from pattern node names to the matched nodes
        """
        q0 = match['quantize0']
        q1 = match['quantize1']

        q0_scale = q0.in_port(1).get_source().node
        q0_zerop = q0.in_port(2).get_source().node
        q1_scale = q1.in_port(1).get_source().node
        q1_zerop = q1.in_port(2).get_source().node

        inp_port = q0.in_port(0).get_source()
        name = inp_port.node.soft_get('name', inp_port.node.id)

        # only constant as for zero_point/scale supported
        if q0_scale.soft_get('type') == 'Const' and q1_scale.soft_get('type') == 'Const' and \
                q0_zerop.soft_get('type') == 'Const' and q1_zerop.soft_get('type') == 'Const':

            # only patterns with same scale/zero_point values for Q and DQ are supported
            # NOTE(review): `==` is applied directly to the Const values; presumably these are scalars --
            # confirm, since a multi-element numpy array is ambiguous in a boolean context.
            if q0_scale.value == q1_scale.value and q0_zerop.value == q1_zerop.value:
                log.debug('Redundant 2Q-DQ pattern after {}'.format(name))

                # Re-attach every consumer of the duplicate branch to the surviving DequantizeLinear.
                dests = match['dequantize1'].out_port(0).get_destinations()
                for dest in dests:
                    dest.disconnect()
                    dest.connect(match['dequantize0'].out_port(0))
                graph.remove_nodes_from([match['quantize1'].id, match['dequantize1'].id])
            else:
                # The message previously had no placeholder, so `name` was silently dropped by format().
                log.error('QuantizeLinears in the fan-out after {} have different scale or zero-point values, '
                          'cannot be removed!'.format(name))
|
||||
|
||||
|
||||
class QuantizeDequantizeRedundant4(FrontReplacementSubgraph):
    """
    Fuses duplicated QuantizeLinear and DequantizeLinear nodes
    (redundancy in the official NV's int8 MLPerf BERT model)
    Covers cases when the values for zero point and scale are same in both QuantizeLinear and DequantizeLinear.

    The pattern is an Add whose output feeds four QuantizeLinear -> DequantizeLinear branches; when all four
    QuantizeLinear nodes use the same constant scale and zero point, branches 1..3 are removed and their
    consumers are re-attached to the first DequantizeLinear.
    """
    enabled = True

    def run_before(self):
        return [QuantizeDequantizeRedundant2]

    def pattern(self):
        return dict(
            nodes=[
                ('inp', dict(op='Add')),
                ('quantize0', dict(op='QuantizeLinear')),
                ('dequantize0', dict(op='DequantizeLinear')),
                ('quantize1', dict(op='QuantizeLinear')),
                ('dequantize1', dict(op='DequantizeLinear')),
                ('quantize2', dict(op='QuantizeLinear')),
                ('dequantize2', dict(op='DequantizeLinear')),
                ('quantize3', dict(op='QuantizeLinear')),
                ('dequantize3', dict(op='DequantizeLinear')),
            ],
            edges=[
                ('inp', 'quantize0', {'in': 0}),
                ('inp', 'quantize1', {'in': 0}),
                ('inp', 'quantize2', {'in': 0}),
                ('inp', 'quantize3', {'in': 0}),
                ('quantize0', 'dequantize0', {'in': 0}),
                ('quantize1', 'dequantize1', {'in': 0}),
                ('quantize2', 'dequantize2', {'in': 0}),
                ('quantize3', 'dequantize3', {'in': 0}),
            ]
        )

    def replace_sub_graph(self, graph: Graph, match: [dict, SubgraphMatch]):
        """
        Removes Q-DQ branches 1..3 when they duplicate branch 0.

        :param graph: graph being transformed
        :param match: mapping from pattern node names to the matched nodes
        """
        quantizes = [match['quantize{}'.format(idx)] for idx in range(4)]
        scales = [q.in_port(1).get_source().node for q in quantizes]
        zero_points = [q.in_port(2).get_source().node for q in quantizes]

        inp_port = quantizes[0].in_port(0).get_source()
        name = inp_port.node.soft_get('name', inp_port.node.id)

        # only constant as for zero_point/scale supported
        # (every scale AND every zero point is checked; previously the scales of branches 2 and 3 were
        # never tested because their zero points were checked twice instead)
        if not all(node.soft_get('type') == 'Const' for node in scales + zero_points):
            return

        # only patterns with same scale/zero_point values for Q and DQ are supported
        # NOTE(review): `==` is applied directly to the Const values; presumably these are scalars --
        # confirm, since a multi-element numpy array is ambiguous in a boolean context.
        if all(scales[0].value == scale.value and zero_points[0].value == zerop.value
               for scale, zerop in zip(scales[1:], zero_points[1:])):
            log.debug('Redundant 4Q-DQ pattern after {}'.format(name))

            # Re-attach the consumers of each duplicate branch to the surviving DequantizeLinear,
            # then drop the duplicate Q-DQ pair.
            for idx in range(1, 4):
                duplicate_q = match['quantize{}'.format(idx)]
                duplicate_dq = match['dequantize{}'.format(idx)]
                for dest in duplicate_dq.out_port(0).get_destinations():
                    dest.disconnect()
                    dest.connect(match['dequantize0'].out_port(0))
                graph.remove_nodes_from([duplicate_q.id, duplicate_dq.id])
        else:
            # The message previously had no placeholder, so `name` was silently dropped by format().
            log.error('QuantizeLinears in the fan-out after {} have different scale or zero-point values, '
                      'cannot be removed!'.format(name))
|
||||
Reference in New Issue
Block a user