From 4a4c3e8ec9e6e53695ee40d9037c6502a879ef0f Mon Sep 17 00:00:00 2001 From: Mikhail Nosov Date: Wed, 23 Jun 2021 09:33:50 +0300 Subject: [PATCH] [DOC] Model caching feature overview (#5519) * Docs: Model caching feature overview * Update docs/IE_DG/Intro_to_Performance.md Co-authored-by: Anastasiya Ageeva * Apply suggestions from code review Co-authored-by: Anastasiya Ageeva * Review comments - Moved code examples to snippets - Added link to Model Caching overview from "Inference Engine Developer Guide" - Few minor changes * Update docs/IE_DG/Intro_to_Performance.md Co-authored-by: Anastasiya Ageeva Co-authored-by: Anastasiya Ageeva --- docs/IE_DG/Intro_to_Performance.md | 6 ++ docs/IE_DG/Model_caching_overview.md | 65 ++++++++++++++++++++++ docs/doxygen/ie_docs.xml | 1 + docs/img/caching_enabled.png | 3 + docs/img/caching_times.png | 3 + docs/snippets/InferenceEngine_Caching0.cpp | 17 ++++++ docs/snippets/InferenceEngine_Caching1.cpp | 13 +++++ docs/snippets/InferenceEngine_Caching2.cpp | 14 +++++ docs/snippets/InferenceEngine_Caching3.cpp | 20 +++++++ 9 files changed, 142 insertions(+) create mode 100644 docs/IE_DG/Model_caching_overview.md create mode 100644 docs/img/caching_enabled.png create mode 100644 docs/img/caching_times.png create mode 100644 docs/snippets/InferenceEngine_Caching0.cpp create mode 100644 docs/snippets/InferenceEngine_Caching1.cpp create mode 100644 docs/snippets/InferenceEngine_Caching2.cpp create mode 100644 docs/snippets/InferenceEngine_Caching3.cpp diff --git a/docs/IE_DG/Intro_to_Performance.md b/docs/IE_DG/Intro_to_Performance.md index 0c9457ed4bf..48d1ea5c56c 100644 --- a/docs/IE_DG/Intro_to_Performance.md +++ b/docs/IE_DG/Intro_to_Performance.md @@ -31,6 +31,12 @@ input images to achieve optimal throughput. However, high batch size also comes latency penalty. So, for more real-time oriented usages, lower batch sizes (as low as a single input) are used. 
Refer to the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample, which allows latency vs. throughput measuring. +## Using Caching API for first inference latency optimization +Since the 2021.4 release, Inference Engine provides the ability to enable internal caching of loaded networks. +This can significantly reduce load network latency for some devices at application startup. +Internally, caching uses the plugin's Export/ImportNetwork flow, like it is done for the [Compile tool](../../inference-engine/tools/compile_tool/README.md), using the regular ReadNetwork/LoadNetwork API. +Refer to the [Model Caching Overview](Model_caching_overview.md) for a more detailed explanation. + ## Using Async API To gain better performance on accelerators, such as VPU, the Inference Engine uses the asynchronous approach (see [Integrating Inference Engine in Your Application (current API)](Integrate_with_customer_application_new_API.md)). diff --git a/docs/IE_DG/Model_caching_overview.md b/docs/IE_DG/Model_caching_overview.md new file mode 100644 index 00000000000..25ae7387c24 --- /dev/null +++ b/docs/IE_DG/Model_caching_overview.md @@ -0,0 +1,65 @@ +# Model Caching Overview {#openvino_docs_IE_DG_Model_caching_overview} + +## Introduction + +As described in [Inference Engine Introduction](inference_engine_intro.md), the common application flow consists of the following steps: + +1. **Create Inference Engine Core object** + +2. **Read the Intermediate Representation** - Read an Intermediate Representation file into an object of the `InferenceEngine::CNNNetwork` + +3. **Prepare inputs and outputs** + +4. **Set configuration** - Pass device-specific loading configurations to the device + +5. **Compile and Load Network to device** - Use the `InferenceEngine::Core::LoadNetwork()` method with a specific device + +6. **Set input data** + +7. 
**Execute** + +Step #5 can potentially perform several time-consuming device-specific optimizations and network compilations, +and such delays can lead to a bad user experience on application startup. To avoid this, some devices offer +Import/Export network capability, and it is possible to either use the [Compile tool](../../inference-engine/tools/compile_tool/README.md) +or enable model caching to export the compiled network automatically. Reusing cached networks can significantly reduce load network time. + + +## Set "CACHE_DIR" config option to enable model caching + +To enable model caching, the application must specify the folder where to store cached blobs. It can be done like this: + + +@snippet snippets/InferenceEngine_Caching0.cpp part0 + +With this code, if the device supports the Import/Export network capability, a cached blob is automatically created inside the `myCacheFolder` folder +when the CACHE_DIR config is set to the Core object. If the device does not support the Import/Export capability, the cache is just not created and no error is thrown. + +Depending on your device, the total time for loading a network on application startup can be significantly reduced. +Please also note that the very first LoadNetwork (when the cache is not yet created) takes a slightly longer time to 'export' the compiled blob into a cache file. +![caching_enabled] + +## Even faster: use LoadNetwork(modelPath) + +In some cases, applications do not need to customize inputs and outputs every time. Such applications always +call `cnnNet = ie.ReadNetwork(...)`, then `ie.LoadNetwork(cnnNet, ..)`, and this can be further optimized. +For such cases, a more convenient API to load a network in one call was introduced in the 2021.4 release. 
+ +@snippet snippets/InferenceEngine_Caching1.cpp part1 + +With model caching enabled, the total load time is even smaller, provided that ReadNetwork is optimized as well. + +@snippet snippets/InferenceEngine_Caching2.cpp part2 + +![caching_times] + + +## Advanced examples + +Not every device supports the network import/export capability; enabling caching for such devices does not have any effect. +To check in advance if a particular device supports model caching, your application can use the following code: + +@snippet snippets/InferenceEngine_Caching3.cpp part3 + + +[caching_enabled]: ../img/caching_enabled.png +[caching_times]: ../img/caching_times.png diff --git a/docs/doxygen/ie_docs.xml b/docs/doxygen/ie_docs.xml index b50832954d4..19a87a1e11e 100644 --- a/docs/doxygen/ie_docs.xml +++ b/docs/doxygen/ie_docs.xml @@ -285,6 +285,7 @@ limitations under the License. + + diff --git a/docs/img/caching_enabled.png b/docs/img/caching_enabled.png new file mode 100644 index 00000000000..f8a898764e1 --- /dev/null +++ b/docs/img/caching_enabled.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:488a7a47e5086a6868c22219bc9d58a3508059e5a1dc470f2653a12552dea82f +size 36207 diff --git a/docs/img/caching_times.png b/docs/img/caching_times.png new file mode 100644 index 00000000000..11d9c8b088f --- /dev/null +++ b/docs/img/caching_times.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eed189f9cb3d30fe13b4ba4515edd4e6da5d01545660e65fa8a33d945967281 +size 28894 diff --git a/docs/snippets/InferenceEngine_Caching0.cpp b/docs/snippets/InferenceEngine_Caching0.cpp new file mode 100644 index 00000000000..5311a3d0bb6 --- /dev/null +++ b/docs/snippets/InferenceEngine_Caching0.cpp @@ -0,0 +1,17 @@ +#include <ie_core.hpp> + +int main() { +using namespace InferenceEngine; + std::string modelPath = "/tmp/myModel.xml"; + std::string device = "GNA"; + std::map<std::string, std::string> deviceConfig; +//! 
[part0] + InferenceEngine::Core ie; // Step 1: create Inference engine object + ie.SetConfig({{CONFIG_KEY(CACHE_DIR), "myCacheFolder"}}); // Step 1b: Enable caching + auto cnnNet = ie.ReadNetwork(modelPath); // Step 2: ReadNetwork + //... // Step 3: Prepare inputs/outputs + //... // Step 4: Set device configuration + ie.LoadNetwork(cnnNet, device, deviceConfig); // Step 5: LoadNetwork +//! [part0] +return 0; +} diff --git a/docs/snippets/InferenceEngine_Caching1.cpp b/docs/snippets/InferenceEngine_Caching1.cpp new file mode 100644 index 00000000000..3c9d0c5b22d --- /dev/null +++ b/docs/snippets/InferenceEngine_Caching1.cpp @@ -0,0 +1,13 @@ +#include <ie_core.hpp> + +int main() { +using namespace InferenceEngine; + std::string modelPath = "/tmp/myModel.xml"; + std::string device = "GNA"; + std::map<std::string, std::string> deviceConfig; +//! [part1] + InferenceEngine::Core ie; // Step 1: create Inference engine object + ie.LoadNetwork(modelPath, device, deviceConfig); // Step 2: LoadNetwork by model file path +//! [part1] +return 0; +} diff --git a/docs/snippets/InferenceEngine_Caching2.cpp b/docs/snippets/InferenceEngine_Caching2.cpp new file mode 100644 index 00000000000..aaf4b33c10d --- /dev/null +++ b/docs/snippets/InferenceEngine_Caching2.cpp @@ -0,0 +1,14 @@ +#include <ie_core.hpp> + +int main() { +using namespace InferenceEngine; + std::string modelPath = "/tmp/myModel.xml"; + std::string device = "GNA"; + std::map<std::string, std::string> deviceConfig; +//! [part2] + InferenceEngine::Core ie; // Step 1: create Inference engine object + ie.SetConfig({{CONFIG_KEY(CACHE_DIR), "myCacheFolder"}}); // Step 1b: Enable caching + ie.LoadNetwork(modelPath, device, deviceConfig); // Step 2: LoadNetwork by model file path +//! 
[part2] +return 0; +} diff --git a/docs/snippets/InferenceEngine_Caching3.cpp b/docs/snippets/InferenceEngine_Caching3.cpp new file mode 100644 index 00000000000..ce91a798552 --- /dev/null +++ b/docs/snippets/InferenceEngine_Caching3.cpp @@ -0,0 +1,20 @@ +#include <ie_core.hpp> + +int main() { +using namespace InferenceEngine; + std::string modelPath = "/tmp/myModel.xml"; + std::string deviceName = "GNA"; + std::map<std::string, std::string> deviceConfig; + InferenceEngine::Core ie; +//! [part3] + // Get list of supported metrics + std::vector<std::string> keys = ie.GetMetric(deviceName, METRIC_KEY(SUPPORTED_METRICS)); + + // Find 'IMPORT_EXPORT_SUPPORT' metric in supported metrics + auto it = std::find(keys.begin(), keys.end(), METRIC_KEY(IMPORT_EXPORT_SUPPORT)); + + // If metric 'IMPORT_EXPORT_SUPPORT' exists, check its value + bool cachingSupported = (it != keys.end()) && ie.GetMetric(deviceName, METRIC_KEY(IMPORT_EXPORT_SUPPORT)); +//! [part3] + return 0; +}