From 4a4c3e8ec9e6e53695ee40d9037c6502a879ef0f Mon Sep 17 00:00:00 2001 From: Mikhail Nosov Date: Wed, 23 Jun 2021 09:33:50 +0300 Subject: [PATCH] [DOC] Model caching feature overview (#5519) * Docs: Model caching feature overview * Update docs/IE_DG/Intro_to_Performance.md Co-authored-by: Anastasiya Ageeva * Apply suggestions from code review Co-authored-by: Anastasiya Ageeva * Review comments - Moved code examples to snippets - Added link to Model Caching overview from "Inference Engine Developer Guide" - Few minor changes * Update docs/IE_DG/Intro_to_Performance.md Co-authored-by: Anastasiya Ageeva Co-authored-by: Anastasiya Ageeva --- docs/IE_DG/Intro_to_Performance.md | 6 ++ docs/IE_DG/Model_caching_overview.md | 65 ++++++++++++++++++++++ docs/doxygen/ie_docs.xml | 1 + docs/img/caching_enabled.png | 3 + docs/img/caching_times.png | 3 + docs/snippets/InferenceEngine_Caching0.cpp | 17 ++++++ docs/snippets/InferenceEngine_Caching1.cpp | 13 +++++ docs/snippets/InferenceEngine_Caching2.cpp | 14 +++++ docs/snippets/InferenceEngine_Caching3.cpp | 20 +++++++ 9 files changed, 142 insertions(+) create mode 100644 docs/IE_DG/Model_caching_overview.md create mode 100644 docs/img/caching_enabled.png create mode 100644 docs/img/caching_times.png create mode 100644 docs/snippets/InferenceEngine_Caching0.cpp create mode 100644 docs/snippets/InferenceEngine_Caching1.cpp create mode 100644 docs/snippets/InferenceEngine_Caching2.cpp create mode 100644 docs/snippets/InferenceEngine_Caching3.cpp diff --git a/docs/IE_DG/Intro_to_Performance.md b/docs/IE_DG/Intro_to_Performance.md index 0c9457ed4bf..48d1ea5c56c 100644 --- a/docs/IE_DG/Intro_to_Performance.md +++ b/docs/IE_DG/Intro_to_Performance.md @@ -31,6 +31,12 @@ input images to achieve optimal throughput. However, high batch size also comes latency penalty. So, for more real-time oriented usages, lower batch sizes (as low as a single input) are used. 
Refer to the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample, which allows latency vs. throughput measuring. +## Using Caching API for first inference latency optimization +Since the 2021.4 release, Inference Engine provides the ability to enable internal caching of loaded networks. +This can significantly reduce load network latency for some devices at application startup. +Internally, caching uses the plugin's Export/ImportNetwork flow, like it is done for the [Compile tool](../../inference-engine/tools/compile_tool/README.md), using the regular ReadNetwork/LoadNetwork API. +Refer to the [Model Caching Overview](Model_caching_overview.md) for a more detailed explanation. + ## Using Async API To gain better performance on accelerators, such as VPU, the Inference Engine uses the asynchronous approach (see [Integrating Inference Engine in Your Application (current API)](Integrate_with_customer_application_new_API.md)). diff --git a/docs/IE_DG/Model_caching_overview.md b/docs/IE_DG/Model_caching_overview.md new file mode 100644 index 00000000000..25ae7387c24 --- /dev/null +++ b/docs/IE_DG/Model_caching_overview.md @@ -0,0 +1,65 @@ +# Model Caching Overview {#openvino_docs_IE_DG_Model_caching_overview} + +## Introduction + +As described in [Inference Engine Introduction](inference_engine_intro.md), the common application flow consists of the following steps: + +1. **Create Inference Engine Core object** + +2. **Read the Intermediate Representation** - Read an Intermediate Representation file into an object of the `InferenceEngine::CNNNetwork` + +3. **Prepare inputs and outputs** + +4. **Set configuration** - Pass device-specific loading configurations to the device + +5. **Compile and Load Network to device** - Use the `InferenceEngine::Core::LoadNetwork()` method with a specific device + +6. **Set input data** + +7. 
**Execute** + +Step #5 can potentially perform several time-consuming device-specific optimizations and network compilations, +and such delays can lead to a bad user experience on application startup. To avoid this, some devices offer +Import/Export network capability, and it is possible to either use the [Compile tool](../../inference-engine/tools/compile_tool/README.md) +or enable model caching to export the compiled network automatically. Reusing cached networks can significantly reduce load network time. + + +## Set "CACHE_DIR" config option to enable model caching + +To enable model caching, the application must specify the folder where to store cached blobs. It can be done like this: + + +@snippet snippets/InferenceEngine_Caching0.cpp part0 + +With this code, if the device supports the Import/Export network capability, a cached blob is automatically created inside the `myCacheFolder` folder +when the CACHE_DIR config is set to the Core object. If the device does not support the Import/Export capability, the cache is just not created and no error is thrown. + +Depending on your device, the total time for loading a network on application startup can be significantly reduced. +Please also note that the very first LoadNetwork (when the cache is not yet created) takes a slightly longer time to 'export' the compiled blob into a cache file. +![caching_enabled] + +## Even faster: use LoadNetwork(modelPath) + +In some cases, applications do not need to customize inputs and outputs every time. Such applications always +call `cnnNet = ie.ReadNetwork(...)`, then `ie.LoadNetwork(cnnNet, ..)`, and this can be further optimized. +For such cases, a more convenient API to load a network in one call was introduced in the 2021.4 release. 
+ +@snippet snippets/InferenceEngine_Caching1.cpp part1 + +With model caching enabled, the total load time is even smaller, provided that ReadNetwork is optimized as well. + +@snippet snippets/InferenceEngine_Caching2.cpp part2 + +![caching_times] + + +## Advanced examples + +Not every device supports the network import/export capability; enabling caching for such devices does not have any effect. +To check in advance if a particular device supports model caching, your application can use the following code: + +@snippet snippets/InferenceEngine_Caching3.cpp part3 + + +[caching_enabled]: ../img/caching_enabled.png +[caching_times]: ../img/caching_times.png diff --git a/docs/doxygen/ie_docs.xml b/docs/doxygen/ie_docs.xml index b50832954d4..19a87a1e11e 100644 --- a/docs/doxygen/ie_docs.xml +++ b/docs/doxygen/ie_docs.xml @@ -285,6 +285,7 @@ limitations under the License. + + diff --git a/docs/img/caching_enabled.png b/docs/img/caching_enabled.png new file mode 100644 index 00000000000..f8a898764e1 --- /dev/null +++ b/docs/img/caching_enabled.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:488a7a47e5086a6868c22219bc9d58a3508059e5a1dc470f2653a12552dea82f +size 36207 diff --git a/docs/img/caching_times.png b/docs/img/caching_times.png new file mode 100644 index 00000000000..11d9c8b088f --- /dev/null +++ b/docs/img/caching_times.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eed189f9cb3d30fe13b4ba4515edd4e6da5d01545660e65fa8a33d945967281 +size 28894 diff --git a/docs/snippets/InferenceEngine_Caching0.cpp b/docs/snippets/InferenceEngine_Caching0.cpp new file mode 100644 index 00000000000..5311a3d0bb6 --- /dev/null +++ b/docs/snippets/InferenceEngine_Caching0.cpp @@ -0,0 +1,17 @@ +#include <ie_core.hpp> + +int main() { +using namespace InferenceEngine; + std::string modelPath = "/tmp/myModel.xml"; + std::string device = "GNA"; + std::map<std::string, std::string> deviceConfig; +//! 
[part0] + InferenceEngine::Core ie; // Step 1: create Inference engine object + ie.SetConfig({{CONFIG_KEY(CACHE_DIR), "myCacheFolder"}}); // Step 1b: Enable caching + auto cnnNet = ie.ReadNetwork(modelPath); // Step 2: ReadNetwork + //... // Step 3: Prepare inputs/outputs + //... // Step 4: Set device configuration + ie.LoadNetwork(cnnNet, device, deviceConfig); // Step 5: LoadNetwork +//! [part0] +return 0; +} diff --git a/docs/snippets/InferenceEngine_Caching1.cpp b/docs/snippets/InferenceEngine_Caching1.cpp new file mode 100644 index 00000000000..3c9d0c5b22d --- /dev/null +++ b/docs/snippets/InferenceEngine_Caching1.cpp @@ -0,0 +1,13 @@ +#include <ie_core.hpp> + +int main() { +using namespace InferenceEngine; + std::string modelPath = "/tmp/myModel.xml"; + std::string device = "GNA"; + std::map<std::string, std::string> deviceConfig; +//! [part1] + InferenceEngine::Core ie; // Step 1: create Inference engine object + ie.LoadNetwork(modelPath, device, deviceConfig); // Step 2: LoadNetwork by model file path +//! [part1] +return 0; +} diff --git a/docs/snippets/InferenceEngine_Caching2.cpp b/docs/snippets/InferenceEngine_Caching2.cpp new file mode 100644 index 00000000000..aaf4b33c10d --- /dev/null +++ b/docs/snippets/InferenceEngine_Caching2.cpp @@ -0,0 +1,14 @@ +#include <ie_core.hpp> + +int main() { +using namespace InferenceEngine; + std::string modelPath = "/tmp/myModel.xml"; + std::string device = "GNA"; + std::map<std::string, std::string> deviceConfig; +//! [part2] + InferenceEngine::Core ie; // Step 1: create Inference engine object + ie.SetConfig({{CONFIG_KEY(CACHE_DIR), "myCacheFolder"}}); // Step 1b: Enable caching + ie.LoadNetwork(modelPath, device, deviceConfig); // Step 2: LoadNetwork by model file path +//! 
[part2] +return 0; +} diff --git a/docs/snippets/InferenceEngine_Caching3.cpp b/docs/snippets/InferenceEngine_Caching3.cpp new file mode 100644 index 00000000000..ce91a798552 --- /dev/null +++ b/docs/snippets/InferenceEngine_Caching3.cpp @@ -0,0 +1,20 @@ +#include <ie_core.hpp> + +int main() { +using namespace InferenceEngine; + std::string modelPath = "/tmp/myModel.xml"; + std::string deviceName = "GNA"; + std::map<std::string, std::string> deviceConfig; + InferenceEngine::Core ie; +//! [part3] + // Get list of supported metrics + std::vector<std::string> keys = ie.GetMetric(deviceName, METRIC_KEY(SUPPORTED_METRICS)); + + // Find 'IMPORT_EXPORT_SUPPORT' metric in supported metrics + auto it = std::find(keys.begin(), keys.end(), METRIC_KEY(IMPORT_EXPORT_SUPPORT)); + + // If metric 'IMPORT_EXPORT_SUPPORT' exists, check its value + bool cachingSupported = (it != keys.end()) && ie.GetMetric(deviceName, METRIC_KEY(IMPORT_EXPORT_SUPPORT)); +//! [part3] + return 0; +}