Publishing 2019 R3 content

2019-10-04 19:26:43 +03:00
parent ba6e22b1b5
commit 0923303e02
1734 changed files with 72094 additions and 58972 deletions
--- a/inference-engine/samples/CMakeLists.txt
+++ b/inference-engine/samples/CMakeLists.txt
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 #

-cmake_minimum_required (VERSION 2.8.11)
+cmake_minimum_required (VERSION 2.8.12)

 project(Samples)

@@ -150,8 +150,6 @@ macro(ie_add_sample)
        if(NOT OpenCV_FOUND)
            message(WARNING "OPENCV is disabled or not found, " ${IE_SAMPLE_NAME} " skipped")
            return()
-        else()
-            add_definitions(-DUSE_OPENCV)
        endif()
    endif()

@@ -164,6 +162,9 @@ macro(ie_add_sample)

    # Create executable file from sources
    add_executable(${IE_SAMPLE_NAME} ${IE_SAMPLE_SOURCES} ${IE_SAMPLES_HEADERS})
+    if(IE_SAMPLE_OPENCV_DEPENDENCIES)
+        target_compile_definitions(${IE_SAMPLE_NAME} PRIVATE USE_OPENCV)
+    endif()

    if(WIN32)
        set_target_properties(${IE_SAMPLE_NAME} PROPERTIES COMPILE_PDB_NAME ${IE_SAMPLE_NAME})
@@ -176,7 +177,6 @@ macro(ie_add_sample)

    target_link_libraries(${IE_SAMPLE_NAME} PRIVATE ${OpenCV_LIBRARIES} ${InferenceEngine_LIBRARIES}
                                                    ${IE_SAMPLE_DEPENDENCIES} IE::ie_cpu_extension gflags)
-
    if(UNIX)
        target_link_libraries(${IE_SAMPLE_NAME} PRIVATE pthread)
    endif()
@@ -195,12 +195,12 @@ endmacro()

 # use this flag if you need to throw custom message in case if the IE package is not found.
 if (IE_NOT_FOUND_MESSAGE)
-    find_package(InferenceEngine 2.0 QUIET)
+    find_package(InferenceEngine 2.1 QUIET)
    if (NOT(InferenceEngine_FOUND))
        message(FATAL_ERROR ${IE_NOT_FOUND_MESSAGE})
    endif()
 else()
-    find_package(InferenceEngine 2.0 REQUIRED)
+    find_package(InferenceEngine 2.1 REQUIRED)
 endif()

 # collect all samples subdirectories
--- a/inference-engine/samples/benchmark_app/README.md
+++ b/inference-engine/samples/benchmark_app/README.md
@@ -1,21 +1,18 @@
-# Benchmark C++ Application
+# Benchmark C++ Tool

-This topic demonstrates how to use the Benchmark Application to estimate deep learning inference performance on
-supported devices. Performance can be measured for two inference modes: synchronous (latency-oriented) and asynchronous (throughput-oriented).
+This topic demonstrates how to use the Benchmark C++ Tool to estimate deep learning inference performance on supported devices. Performance can be measured for two inference modes: synchronous (latency-oriented) and asynchronous (throughput-oriented).

-> **NOTE:** This topic describes usage of C++ implementation of the Benchmark Application. For the Python* implementation, refer to [Benchmark Application (Python*)](./inference-engine/ie_bridges/python/sample/benchmark_app/README.md).
+> **NOTE:** This topic describes usage of C++ implementation of the Benchmark Tool. For the Python* implementation, refer to [Benchmark Python* Tool](./inference-engine/tools/benchmark_tool/README.md).


 ## How It Works

-Upon start-up, the application reads command-line parameters and loads a network and images/binary files to the Inference Engine
-plugin, which is chosen depending on a specified device. The number of infer requests and execution approach depend
-on the mode defined with the `-api` command-line parameter.
+Upon start-up, the application reads command-line parameters and loads a network and images/binary files to the Inference Engine plugin, which is chosen depending on a specified device. The number of infer requests and execution approach depend on the mode defined with the `-api` command-line parameter.

-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples, tools and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).

 If you run the application in the synchronous mode, it creates one infer request and executes the `Infer` method.
-If you run the application in the asynchronous mode, it creates as many infer requests as specified in the `-nireq` command-line parameter and executes the `StartAsync` method for each of them. If `-nireq` is not set, the demo will use the default value for specified device.
+If you run the application in the asynchronous mode, it creates as many infer requests as specified in the `-nireq` command-line parameter and executes the `StartAsync` method for each of them. If `-nireq` is not set, the application will use the default value for specified device.

 A number of execution steps is defined by one of the following parameters:
 * Number of iterations specified with the `-niter` command-line argument
@@ -45,14 +42,19 @@ The application also saves executable graph information serialized to a XML file
 `-exec_graph_path` parameter.


-## Running
+## Run the Tool
 Notice that the benchmark_app usually produces optimal performance for any device out of the box.

-**So in most cases you don't need to play the app options explicitly and the plain device name is enough**, e.g.:
+**So in most cases you don't need to tune the app options explicitly and the plain device name is enough**, for example, for CPU:
+```sh
+./benchmark_app -m <model> -i <input> -d CPU
 ```
-$benchmark_app -m <model> -i <input> -d CPU
-```
-As explained in the  [Introduction to Performance Topics](./docs/IE_DG/Intro_to_Performance.md) section, it is preferable to use the FP16 IR for the model.
+
+However, it still may be non-optimal for some cases, especially for very small networks. For more details, see [Introduction to Performance Topics](./docs/IE_DG/Intro_to_Performance.md).
+
+As explained in the [Introduction to Performance Topics](./docs/IE_DG/Intro_to_Performance.md) section, for all devices, including the new [MULTI device](./docs/IE_DG/supported_plugins/MULTI.md), it is preferable to use the FP16 IR for the model.
+Also if latency of the CPU inference on the multi-socket machines is of concern, please refer to the same
+[Introduction to Performance Topics](./docs/IE_DG/Intro_to_Performance.md) document.

 Running the application with the `-h` option yields the following usage message:
 ```
@@ -70,6 +72,7 @@ Options:
    -m "<path>"               Required. Path to an .xml file with a trained model.
    -d "<device>"             Optional. Specify a target device to infer on (the list of available devices is shown below). Default value is CPU.
                              Use "-d HETERO:<comma-separated_devices_list>" format to specify HETERO plugin.
+                              Use "-d MULTI:<comma-separated_devices_list>" format to specify MULTI plugin. 
    The application looks for a suitable plugin for the specified device.
    -l "<absolute_path>"      Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.
          Or
@@ -84,8 +87,11 @@ Options:

  CPU-specific performance options:
    -nstreams "<integer>"     Optional. Number of streams to use for inference on the CPU or/and GPU in throughput mode
-                              (for HETERO device case use format <device1>:<nstreams1>,<device2>:<nstreams2> or just <nstreams>).
-    -nthreads "<integer>"     Optional. Number of threads to use for inference on the CPU (including HETERO case).
+                              (for HETERO and MULTI device cases use format <device1>:<nstreams1>,<device2>:<nstreams2> or just <nstreams>).
+                              Default value is determined automatically for a device. 
+                              Please note that although the automatic selection usually provides a reasonable performance, 
+                              it still may be non-optimal for some cases, especially for very small networks.
+    -nthreads "<integer>"     Optional. Number of threads to use for inference on the CPU (including HETERO and MULTI cases).
    -pin "YES"/"NO"           Optional. Enable ("YES" is default value) or disable ("NO") CPU threads pinning for CPU-involved inference.

  Statistics dumping options:
@@ -102,48 +108,74 @@ If a model has only image input(s), please a provide folder with images or a pat
 If a model has some specific input(s) (not images), please prepare a binary file(s), which is filled with data of appropriate precision and provide a path to them as input.
 If a model has mixed input types, input folder should contain all required files. Image inputs are filled with image files one by one. Binary inputs are filled with binary inputs one by one.

-To download the pre-trained models, use the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or go to [https://download.01.org/opencv/](https://download.01.org/opencv/).
+To run the tool, you can use public or Intel's pre-trained models. To download the models, use the OpenVINO [Model Downloader](./tools/downloader/README.md) or go to [https://download.01.org/opencv/](https://download.01.org/opencv/).

-> **NOTE**: Before running the demo with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+> **NOTE**: Before running the tool with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).

-For example, to perform inference on CPU in the synchronous mode and get estimated performance metrics for AlexNet model,
-run the following command:
+## Examples of Running the Tool

-```sh
-./benchmark_app -i <path_to_image>/inputImage.bmp -m <path_to_model>/alexnet_fp32.xml -d CPU -api sync
-```
+This section provides step-by-step instructions on how to run the Benchmark Tool with the `googlenet-v1` public model on CPU or FPGA devices. As an input, the `car.png` file from the `<INSTALL_DIR>/deployment_tools/demo/` directory is used.  

-For the asynchronous mode:
-```sh
-./benchmark_app -i <path_to_image>/inputImage.bmp -m <path_to_model>/alexnet_fp32.xml -d CPU -api async
-```
+> **NOTE:** Internet access is required to execute the following steps successfully. If you have access to the Internet through a proxy server only, please make sure that it is configured in your OS environment.

-## Demo Output
+1. Download the model. Go to the Model Downloader directory and run the `downloader.py` script, specifying the model name and the directory to download the model to:
+   ```sh
+   cd <INSTALL_DIR>/deployment_tools/open_model_zoo/tools/downloader
+   ```
+   ```sh
+   python3 downloader.py --name googlenet-v1 -o <models_dir>
+   ```
+2. Convert the model to the Inference Engine IR format. Go to the Model Optimizer directory and run the `mo.py` script, specifying the path to the model, model format (which must be FP32 for CPU and FPGA) and output directory to generate the IR files:
+   ```sh
+   cd <INSTALL_DIR>/deployment_tools/model_optimizer
+   ```
+   ```sh
+   python3 mo.py --input_model <models_dir>/public/googlenet-v1/googlenet-v1.caffemodel --data_type FP32 --output_dir <ir_dir>
+   ```     
+3. Run the tool, specifying the `<INSTALL_DIR>/deployment_tools/demo/car.png` file as an input image, the IR of the `googlenet-v1` model and a device to perform inference on. The following commands demonstrate running the Benchmark Tool in the asynchronous mode on CPU and FPGA devices:
+   
+   * On CPU:
+   ```sh
+   ./benchmark_app -m <ir_dir>/googlenet-v1.xml -d CPU -api async -i <INSTALL_DIR>/deployment_tools/demo/car.png --progress true
+   ```
+   * On FPGA:
+   ```sh
+   ./benchmark_app -m <ir_dir>/googlenet-v1.xml -d HETERO:FPGA,CPU -api async -i <INSTALL_DIR>/deployment_tools/demo/car.png --progress true
+   ```

 The application outputs the number of executed iterations, total duration of execution, latency and throughput.
-Additionally, if you set the `-report_type` parameter, the application outputs statistics report.
-If you set the `-pc` parameter, the application outputs performance counters.
-If you set `-exec_graph_path`, the application reports executable graph information serialized.
+Additionally, if you set the `-report_type` parameter, the application outputs statistics report. If you set the `-pc` parameter, the application outputs performance counters. If you set `-exec_graph_path`, the application reports executable graph information serialized. All measurements including per-layer PM counters are reported in milliseconds.

-```
-[Step 8/9] Measuring performance (Start inference asyncronously, 60000 ms duration, 4 inference requests in parallel using 4 streams)
-Progress: [....................] 100.00% done
+Below are fragments of sample output for CPU and FPGA devices: 

-[Step 9/9] Dumping statistics report
-[ INFO ] Statistics collecting was not requested. No reports are dumped.
-Progress: [....................] 100.00% done
+* For CPU:
+   ```
+   [Step 8/9] Measuring performance (Start inference asyncronously, 60000 ms duration, 4 inference requests in parallel using 4 streams)
+   Progress: [....................] 100.00% done

-Count:      4612 iterations
-Duration:   60110.04 ms
-Latency:    50.99 ms
-Throughput: 76.73 FPS
+   [Step 9/9] Dumping statistics report
+   [ INFO ] Statistics collecting was not requested. No reports are dumped.
+   Progress: [....................] 100.00% done

-```
+   Count:      4612 iterations
+   Duration:   60110.04 ms
+   Latency:    50.99 ms
+   Throughput: 76.73 FPS
+   ```

-All measurements including per-layer PM counters are reported in milliseconds.
+* For FPGA:
+   ```
+   [Step 10/11] Measuring performance (Start inference asynchronously, 5 inference requests using 4 streams for CPU, limits: 120000 ms duration)
+   Progress: [....................] 100% done

+   [Step 11/11] Dumping statistics report
+   Count:      102515 iterations
+   Duration:   120007.38 ms
+   Latency:    5.84 ms
+   Throughput: 854.24 FPS
+   ```

 ## See Also
 * [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
 * [Model Optimizer](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md)
-* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader)
+* [Model Downloader](./tools/downloader/README.md)
--- a/inference-engine/samples/benchmark_app/benchmark_app.hpp
+++ b/inference-engine/samples/benchmark_app/benchmark_app.hpp
@@ -23,7 +23,9 @@ static const char api_message[] = "Optional. Enable Sync/Async API. Default valu

 /// @brief message for assigning cnn calculation to device
 static const char target_device_message[] = "Optional. Specify a target device to infer on (the list of available devices is shown below). " \
-"Default value is CPU. Use \"-d HETERO:<comma-separated_devices_list>\" format to specify HETERO plugin. ";
+"Default value is CPU. Use \"-d HETERO:<comma-separated_devices_list>\" format to specify HETERO plugin. " \
+"Use \"-d MULTI:<comma-separated_devices_list>\" format to specify MULTI plugin. " \
+"The application looks for a suitable plugin for the specified device.";

 /// @brief message for iterations count
 static const char iterations_count_message[] = "Optional. Number of iterations. " \
@@ -37,11 +39,14 @@ static const char execution_time_message[] = "Optional. Time in seconds to execu

 /// @brief message for #threads for CPU inference
 static const char infer_num_threads_message[] = "Optional. Number of threads to use for inference on the CPU "
-                                                "(including HETERO case).";
+                                                "(including HETERO and MULTI cases).";

 /// @brief message for #streams for CPU inference
 static const char infer_num_streams_message[] = "Optional. Number of streams to use for inference on the CPU or/and GPU in throughput mode "
-                                                "(for HETERO device case use format <device1>:<nstreams1>,<device2>:<nstreams2> or just <nstreams>)";
+                                                "(for HETERO and MULTI device cases use format <dev1>:<nstreams1>,<dev2>:<nstreams2> or just <nstreams>). "
+                                                "Default value is determined automatically for a device.Please note that although the automatic selection "
+                                                "usually provides a reasonable performance, it still may be non - optimal for some cases, especially for "
+                                                "very small networks. See sample's README for more details.";

 /// @brief message for user library argument
 static const char custom_cpu_library_message[] = "Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.";
--- a/inference-engine/samples/benchmark_app/main.cpp
+++ b/inference-engine/samples/benchmark_app/main.cpp
@@ -62,6 +62,10 @@ bool ParseAndCheckCommandLine(int argc, char *argv[]) {
        throw std::logic_error(err);
    }

+    if ((FLAGS_report_type == averageCntReport) && ((FLAGS_d.find("MULTI") != std::string::npos))) {
+        throw std::logic_error("only " + std::string(detailedCntReport) + " report type is supported for MULTI device");
+    }
+
    return true;
 }

@@ -89,10 +93,20 @@ static void next_step(const std::string additional_info = "") {
              << (additional_info.empty() ? "" : " (" + additional_info + ")") << std::endl;
 }

+template <typename T>
+T getMedianValue(const std::vector<T> &vec) {  // median of vec; the argument itself is never modified
+    std::vector<T> sortedVec(vec);  // sort a private copy so the caller's element order is preserved
+    std::sort(sortedVec.begin(), sortedVec.end());
+    return (sortedVec.size() % 2 != 0) ?
+           sortedVec[sortedVec.size() / 2ULL] :  // odd count: the single middle element
+           (sortedVec[sortedVec.size() / 2ULL] + sortedVec[sortedVec.size() / 2ULL - 1ULL]) / static_cast<T>(2.0);  // even count: mean of the two middle elements
+}  // NOTE(review): an empty vec takes the even-count branch and indexes out of bounds; callers must pass at least one element
+
 /**
 * @brief The entry point of the benchmark application
 */
 int main(int argc, char *argv[]) {
+    std::shared_ptr<StatisticsReport> statistics;
    try {
        // ----------------- 1. Parsing and validating input arguments -------------------------------------------------
        next_step();
@@ -101,10 +115,30 @@ int main(int argc, char *argv[]) {
            return 0;
        }

+        if (!FLAGS_report_type.empty()) {
+            std::vector<gflags::CommandLineFlagInfo> flags;
+            StatisticsReport::Parameters command_line_arguments;
+            gflags::GetAllFlags(&flags);
+
+            for (auto &flag : flags) {
+                if (!flag.is_default) {
+                    command_line_arguments.push_back({ flag.name, flag.current_value });
+                }
+            }
+            statistics = std::make_shared<StatisticsReport>(StatisticsReport::Config{FLAGS_report_type, FLAGS_report_folder});
+            statistics->addParameters(StatisticsReport::Category::COMMAND_LINE_PARAMETERS, command_line_arguments);
+        }
+
        /** This vector stores paths to the processed images **/
        std::vector<std::string> inputFiles;
        parseInputFilesArguments(inputFiles);

+        if (FLAGS_nstreams.empty()) {
+            slog::warn << "-nstreams default value is determined automatically for a device. "
+                "Although the automatic selection usually provides a reasonable performance,"
+                "but it still may be non-optimal for some cases, for more information look at README." << slog::endl<< slog::endl;
+        }
+
        // ----------------- 2. Loading the Inference Engine -----------------------------------------------------------
        next_step();

@@ -141,9 +175,25 @@ int main(int argc, char *argv[]) {
        slog::info << "Loading network files" << slog::endl;

        CNNNetReader netBuilder;
+        auto startTime = Time::now();
        netBuilder.ReadNetwork(FLAGS_m);
        const std::string binFileName = fileNameNoExt(FLAGS_m) + ".bin";
        netBuilder.ReadWeights(binFileName);
+        auto float_to_string = [] (const float number) {
+            std::stringstream ss;
+            ss << std::fixed << std::setprecision(2) << number;
+            return ss.str();
+        };
+        auto get_total_ms_time = [ &startTime ] () {
+            return std::chrono::duration_cast<ns>(Time::now() - startTime).count() * 0.000001;
+        };
+        auto duration_ms = float_to_string(get_total_ms_time());
+        slog::info << "Read network took " << duration_ms << " ms" << slog::endl;
+        if (statistics)
+            statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+                                      {
+                                          {"read network time (ms)", duration_ms}
+                                      });

        CNNNetwork cnnNetwork = netBuilder.getNetwork();
        const InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
@@ -180,8 +230,9 @@ int main(int argc, char *argv[]) {
        }

        const size_t batchSize = cnnNetwork.getBatchSize();
+        const Precision precision = cnnNetwork.getPrecision();
        slog::info << (FLAGS_b != 0 ? "Network batch size was changed to: " : "Network batch size: ") << batchSize <<
-            ", precision: " << cnnNetwork.getPrecision() << slog::endl;
+            ", precision: " << precision << slog::endl;

        // ----------------- 5. Configuring input ----------------------------------------------------------------------
        next_step();
@@ -198,7 +249,8 @@ int main(int argc, char *argv[]) {

        bool perf_counts = (FLAGS_report_type == detailedCntReport ||
                            FLAGS_report_type == averageCntReport ||
-                            FLAGS_pc);
+                            FLAGS_pc ||
+                            !FLAGS_exec_graph_path.empty());

        auto devices = parseDevices(device_name);
        std::map<std::string, uint32_t> device_nstreams = parseValuePerDevice(devices, FLAGS_nstreams);
@@ -208,8 +260,13 @@ int main(int argc, char *argv[]) {
                if (FLAGS_nthreads != 0)
                    ie.SetConfig({{ CONFIG_KEY(CPU_THREADS_NUM), std::to_string(FLAGS_nthreads) }}, device);

-                // pin threads for CPU portion of inference
-                ie.SetConfig({{ CONFIG_KEY(CPU_BIND_THREAD), FLAGS_pin }}, device);
+                if ((device_name.find("MULTI") != std::string::npos) &&
+                    (device_name.find("GPU") != std::string::npos)) {
+                    ie.SetConfig({{ CONFIG_KEY(CPU_BIND_THREAD), CONFIG_VALUE(NO) }}, device);
+                } else {
+                    // pin threads for CPU portion of inference
+                    ie.SetConfig({{ CONFIG_KEY(CPU_BIND_THREAD), FLAGS_pin }}, device);
+                }

                // for CPU execution, more throughput-oriented execution via streams
                if (FLAGS_api == "async")
@@ -223,6 +280,13 @@ int main(int argc, char *argv[]) {
                                    (device_nstreams.count(device) > 0 ? std::to_string(device_nstreams.at(device)) :
                                                                         "GPU_THROUGHPUT_AUTO") }}, device);
                device_nstreams[device] = std::stoi(ie.GetConfig(device, CONFIG_KEY(GPU_THROUGHPUT_STREAMS)).as<std::string>());
+
+                if ((device_name.find("MULTI") != std::string::npos) &&
+                    (device_name.find("CPU") != std::string::npos)) {
+                    // multi-device execution with the CPU + GPU performs best with GPU throttling hint,
+                    // which releases another CPU thread (that is otherwise used by the GPU driver for active polling)
+                    ie.SetConfig({{ CLDNN_CONFIG_KEY(PLUGIN_THROTTLE), "1" }}, "GPU");
+                }
            } else if (device == "MYRIAD") {
                ie.SetConfig({{ CONFIG_KEY(LOG_LEVEL), CONFIG_VALUE(LOG_NONE) },
                              { VPU_CONFIG_KEY(LOG_LEVEL), CONFIG_VALUE(LOG_WARNING) }}, device);
@@ -234,7 +298,15 @@ int main(int argc, char *argv[]) {

        std::map<std::string, std::string> config = {{ CONFIG_KEY(PERF_COUNT), perf_counts ? CONFIG_VALUE(YES) :
                                                                                             CONFIG_VALUE(NO) }};
+        startTime = Time::now();
        ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, device_name, config);
+        duration_ms = float_to_string(get_total_ms_time());
+        slog::info << "Load network took " << duration_ms << " ms" << slog::endl;
+        if (statistics)
+            statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+                                      {
+                                          {"load network time (ms)", duration_ms}
+                                      });

        // ----------------- 8. Setting optimal runtime parameters -----------------------------------------------------
        next_step();
@@ -274,6 +346,28 @@ int main(int argc, char *argv[]) {
        }
        uint64_t duration_nanoseconds = getDurationInNanoseconds(duration_seconds);

+        if (statistics) {
+            statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
+                                      {
+                                            {"topology", cnnNetwork.getName()},
+                                            {"target device", device_name},
+                                            {"API", FLAGS_api},
+                                            {"precision", std::string(precision.name())},
+                                            {"batch size", std::to_string(batchSize)},
+                                            {"number of iterations", std::to_string(niter)},
+                                            {"number of parallel infer requests", std::to_string(nireq)},
+                                            {"duration (ms)", std::to_string(getDurationInMilliseconds(duration_seconds))},
+                                      });
+            for (auto& nstreams : device_nstreams) {
+                std::stringstream ss;
+                ss << "number of " << nstreams.first << " streams";
+                statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
+                                          {
+                                                {ss.str(), std::to_string(nstreams.second)},
+                                          });
+            }
+        }
+
        // ----------------- 9. Creating infer requests and filling input blobs ----------------------------------------
        next_step();

@@ -333,7 +427,7 @@ int main(int argc, char *argv[]) {
        inferRequestsQueue.waitAll();
        inferRequestsQueue.resetTimes();

-        const auto startTime = Time::now();
+        startTime = Time::now();
        auto execTime = std::chrono::duration_cast<ns>(Time::now() - startTime).count();

        /** Start inference & calculate performance **/
@@ -373,35 +467,34 @@ int main(int argc, char *argv[]) {
        // wait the latest inference executions
        inferRequestsQueue.waitAll();

-        StatisticsReport statistics({ FLAGS_d,
-                                      FLAGS_api,
-                                      batchSize,
-                                      nireq,
-                                      niter,
-                                      getDurationInMilliseconds(duration_seconds),
-                                      FLAGS_nthreads,
-                                      device_nstreams,
-                                      FLAGS_pin,
-                                      FLAGS_report_type,
-                                      FLAGS_report_folder
-                                    });
-        if (perf_counts) {
-            for (auto& request : inferRequestsQueue.requests) {
-                statistics.addPerfCounts(request->getPerformanceCounts());
-            }
-        }
-        statistics.addLatencies(inferRequestsQueue.getLatencies());
-
+        double latency = getMedianValue<double>(inferRequestsQueue.getLatencies());
        double totalDuration = inferRequestsQueue.getDurationInMilliseconds();
-        double fps = (FLAGS_api == "sync") ? batchSize * 1000.0 / statistics.getMedianLatency() :
+        double fps = (FLAGS_api == "sync") ? batchSize * 1000.0 / latency :
                                             batchSize * 1000.0 * iteration / totalDuration;
+
+        if (statistics) {
+            statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+                                      {
+                                        {"total execution time (ms)", float_to_string(totalDuration)},
+                                        {"total number of iterations", std::to_string(iteration)},
+                                      });
+            if (device_name.find("MULTI") == std::string::npos) {
+                statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+                                          {
+                                            {"latency (ms)", float_to_string(latency)},
+                                          });
+            }
+            statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+                                      {
+                                          {"throughput", float_to_string(fps)}
+                                      });
+        }
+
        progressBar.finish();

        // ----------------- 11. Dumping statistics report -------------------------------------------------------------
        next_step();

-        statistics.dump(fps, iteration, totalDuration);
-
        if (!FLAGS_exec_graph_path.empty()) {
            try {
                CNNNetwork execGraphInfo = exeNetwork.GetExecGraphInfo();
@@ -412,19 +505,40 @@ int main(int argc, char *argv[]) {
            }
        }

-        if (FLAGS_pc) {
+        if (perf_counts) {
+            std::vector<std::map<std::string, InferenceEngine::InferenceEngineProfileInfo>> perfCounts;
            for (size_t ireq = 0; ireq < nireq; ireq++) {
-                slog::info << "Pefrormance counts for " << ireq << "-th infer request:" << slog::endl;
-                printPerformanceCounts(inferRequestsQueue.requests[ireq]->getPerformanceCounts(), std::cout, getFullDeviceName(ie, FLAGS_d), false);
+                auto reqPerfCounts = inferRequestsQueue.requests[ireq]->getPerformanceCounts();
+                if (FLAGS_pc) {
+                    slog::info << "Pefrormance counts for " << ireq << "-th infer request:" << slog::endl;
+                    printPerformanceCounts(reqPerfCounts, std::cout, getFullDeviceName(ie, FLAGS_d), false);
+                }
+                perfCounts.push_back(reqPerfCounts);
+            }
+            if (statistics) {
+                statistics->dumpPerformanceCounters(perfCounts);
            }
        }

+        if (statistics)
+            statistics->dump();
+
        std::cout << "Count:      " << iteration << " iterations" << std::endl;
-        std::cout << "Duration:   " << totalDuration << " ms" << std::endl;
-        std::cout << "Latency:    " << statistics.getMedianLatency() << " ms" << std::endl;
-        std::cout << "Throughput: " << fps << " FPS" << std::endl;
+        std::cout << "Duration:   " << float_to_string(totalDuration) << " ms" << std::endl;
+        if (device_name.find("MULTI") == std::string::npos)
+            std::cout << "Latency:    " << float_to_string(latency) << " ms" << std::endl;
+        std::cout << "Throughput: " << float_to_string(fps) << " FPS" << std::endl;
    } catch (const std::exception& ex) {
        slog::err << ex.what() << slog::endl;
+
+        if (statistics) {
+            statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+                                      {
+                                            {"error", ex.what()},
+                                      });
+            statistics->dump();
+        }
+
        return 3;
    }

--- a/inference-engine/samples/benchmark_app/statistics_report.cpp
+++ b/inference-engine/samples/benchmark_app/statistics_report.cpp
@@ -10,215 +10,127 @@

 #include "statistics_report.hpp"

-void StatisticsReport::addPerfCounts(const std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &pmStat) {
-    if (_config.report_type == averageCntReport || _config.report_type == detailedCntReport) {
-        // collect per-iteration statistics only in case of enabled median/detailed statistic collecting
-        _performanceCounters.push_back(pmStat);
-    }
+// Appends `parameters` to the list stored for `category`, creating the list on first use.
+void StatisticsReport::addParameters(const Category &category, const Parameters& parameters) {
+    // operator[] default-constructs an empty list for an unseen category,
+    // so a single append covers both the first and the subsequent calls.
+    Parameters& stored = _parameters[category];
+    stored.insert(stored.end(), parameters.begin(), parameters.end());
 }

-void StatisticsReport::addLatencies(const std::vector<double> &latencies) {
-    _latencies.insert(_latencies.end(), latencies.begin(), latencies.end());
-}
+void StatisticsReport::dump() {
+    CsvDumper dumper(true, _config.report_folder + _separator + "benchmark_report.csv");

-void StatisticsReport::dump(const double &fps, const size_t &iteration_number, const double &totalExecTime) {
-    if (_config.report_type.empty()) {
-        slog::info << "Statistics collecting was not requested. No reports are dumped." << slog::endl;
-        return;
-    }
-
-    std::string separator =
-#if defined _WIN32 || defined __CYGWIN__
-    #   if defined UNICODE
-        L"\\";
-    #   else
-        "\\";
-    #   endif
-#else
-        "/";
-#endif
-    if (_config.report_folder.empty())
-        separator = "";
-
-    CsvDumper dumper(true, _config.report_folder + separator + "benchmark_" + _config.report_type + "_report.csv");
-
-    // resulting number of columns in csv file depends on the report_type. If it's noCntReport, then
-    // no PM data is collected and there are only 3 columns in the file (in configuration section). If it's
-    // averageCntReport then median PM values are collected per each layer and the number of columns is 6.
-    // Example from GPU:
-    //
-    // layer name;exec status;layer type;exec type;real time;cpu time;
-    // conv1;EXECUTED;Convolution;convolution_gpu_bfyx_gemm_like;615;3;
-    // Here, all the data are taken from InferenceEngine::InferenceEngineProfileInfo.
-    //
-    // In case of detailedCntReport the number of columns is 4 + _config.nireq * 2, because first 4 parameters
-    // are the same but realTime and cpuTime can be different on each iteration (example from 5 GPU requests):
-    // conv1;EXECUTED;Convolution;convolution_gpu_bfyx_gemm_like;630,3;617,3;616,3;615,3;617,3;
-    size_t numOfColumns = 0;
-    if (_config.report_type == noCntReport) {
-        numOfColumns = 3;
-    } else if (_config.report_type == averageCntReport) {
-        numOfColumns = 6;
-    } else {
-        // for detailedCntReport
-        numOfColumns = 4 + _config.nireq * 2;
-    }
-
-    auto completeCsvRow = [](CsvDumper &dumper, size_t numOfColumns, size_t filled) {
-        for (size_t i = 0; i < numOfColumns - filled; i++)
-            dumper << "";
-        dumper.endLine();
-    };
-
-    // dump execution configuration
-    dumper << "Configuration setup";
-    completeCsvRow(dumper, numOfColumns, 1);
-    dumper << "config option" << "CLI parameter" << "value";
-    completeCsvRow(dumper, numOfColumns, 3);
-
-    dumper << "target device" << " -d" << _config.device;
-    completeCsvRow(dumper, numOfColumns, 3);
-    dumper << "execution mode" << " -api" << _config.api;
-    completeCsvRow(dumper, numOfColumns, 3);
-    dumper << "batch size" << " -b" << _config.batch;
-    completeCsvRow(dumper, numOfColumns, 3);
-    dumper << "number of iterations" << " -niter" << _config.niter;
-    completeCsvRow(dumper, numOfColumns, 3);
-    dumper << "number of parallel infer requests" << " -nireq" << _config.nireq;
-    completeCsvRow(dumper, numOfColumns, 3);
-    dumper << "duration in ms" << " -t" << _config.duration;
-    completeCsvRow(dumper, numOfColumns, 3);
-    dumper << "number of CPU threads" << " -nthreads" << _config.cpu_nthreads;
-    completeCsvRow(dumper, numOfColumns, 3);
-    for (auto& item : _config.nstreams)
-        dumper << "number of " << item.first << " streams" << " -nstreams" << item.second;
-    completeCsvRow(dumper, numOfColumns, 3);
-    dumper << "CPU pinning enabled" << " -pin" << _config.cpu_pin;
-    completeCsvRow(dumper, numOfColumns, 3);
-
-    dumper.endLine();
-
-    // write PM data from each iteration
-    if (!_performanceCounters.empty()) {
-        if (_config.report_type != averageCntReport && _config.report_type != detailedCntReport) {
-            throw std::logic_error("PM data can only be collected for average or detailed report types");
-        }
-
-        // this vector is sorted according to network layers execution order.
-        auto performanceMapSorted = preparePmStatistics();
-
-        dumper << "Performance counters";
-        completeCsvRow(dumper, numOfColumns, 1);
-        dumper << "layer name" << "exec status" << "layer type" << "exec type";
-
-        if (_config.report_type == averageCntReport) {
-            dumper << "average real time" << "average cpu time";
-            completeCsvRow(dumper, numOfColumns, 6);
-        } else {
-            // detailedCntReport case
-            for (size_t i = 0; i< _performanceCounters.size(); i++) {
-                dumper << "realTime_req" + std::to_string(i) << "cpuTime_req" + std::to_string(i);
-            }
-            completeCsvRow(dumper, numOfColumns, 4 + _performanceCounters.size() * 2);
-        }
-
-        for (const auto &layer : performanceMapSorted) {
-            dumper << layer.first;  // layer name
-
-            switch (layer.second.status) {
-                case InferenceEngine::InferenceEngineProfileInfo::EXECUTED:
-                    dumper << "EXECUTED";
-                    break;
-                case InferenceEngine::InferenceEngineProfileInfo::NOT_RUN:
-                    dumper << "NOT_RUN";
-                    break;
-                case InferenceEngine::InferenceEngineProfileInfo::OPTIMIZED_OUT:
-                    dumper << "OPTIMIZED_OUT";
-                    break;
-            }
-            dumper << layer.second.layer_type << layer.second.exec_type;
-
-            if (_config.report_type == averageCntReport) {
-                // write average realTime and cpuTime from each processed request for current layer
-                dumper <<
-                std::to_string(std::accumulate(_perLayerRealTime[layer.first].begin(),
-                                               _perLayerRealTime[layer.first].end(), 0.0) / _perLayerRealTime[layer.first].size() / 1000.0) <<
-                std::to_string(std::accumulate(_perLayerCpuTime[layer.first].begin(),
-                                               _perLayerCpuTime[layer.first].end(), 0.0) / _perLayerCpuTime[layer.first].size()  / 1000.0);
-            } else {
-                // write all realTime and cpuTime from each processed request for current layer
-                for (size_t i = 0; i < _config.nireq; i++) {
-                    dumper << std::to_string(_perLayerRealTime[layer.first][i] / 1000.0) << std::to_string(_perLayerCpuTime[layer.first][i] / 1000.0);
-                }
-            }
+    auto dump_parameters = [ &dumper ] (const Parameters &parameters) {
+        for (auto& parameter : parameters) {
+            dumper << parameter.first << parameter.second;
            dumper.endLine();
        }
+    };
+    if (_parameters.count(Category::COMMAND_LINE_PARAMETERS)) {
+        dumper << "Command line parameters";
+        dumper.endLine();
+
+        dump_parameters(_parameters.at(Category::COMMAND_LINE_PARAMETERS));
        dumper.endLine();
    }

+    if (_parameters.count(Category::RUNTIME_CONFIG)) {
+        dumper << "Configuration setup";
+        dumper.endLine();
+
+        dump_parameters(_parameters.at(Category::RUNTIME_CONFIG));
+        dumper.endLine();
+    }
+
+    if (_parameters.count(Category::EXECUTION_RESULTS)) {
+        dumper << "Execution results";
+        dumper.endLine();
+
+        dump_parameters(_parameters.at(Category::EXECUTION_RESULTS));
+        dumper.endLine();
+    }
+
+    slog::info << "Statistics report is stored to " << dumper.getFilename() << slog::endl;
+}
+
+/// @brief Writes one performance counters table to the given csv dumper.
+/// Emits a header row, one row per layer in the order returned by perfCountersSorted(),
+/// a "Total" row with the summed real/cpu times and a trailing empty line.
+/// @param dumper csv dumper the table is appended to
+/// @param perfCounts per-layer profiling info; the *_uSec fields are written divided by 1000 as "(ms)" columns
+void StatisticsReport::dumpPerformanceCountersRequest(CsvDumper& dumper,
+                                                      const PerformaceCounters& perfCounts) {
+    auto performanceMapSorted = perfCountersSorted(perfCounts);
+
+    long long total = 0L;
+    long long total_cpu = 0L;
+
+    dumper << "layerName" << "execStatus" << "layerType" << "execType";
+    dumper << "realTime (ms)" << "cpuTime (ms)";
+    dumper.endLine();
+
+    for (const auto &layer : performanceMapSorted) {
+        dumper << layer.first;  // layer name
+
+        switch (layer.second.status) {
+            case InferenceEngine::InferenceEngineProfileInfo::EXECUTED:
+                dumper << "EXECUTED";
+                break;
+            case InferenceEngine::InferenceEngineProfileInfo::NOT_RUN:
+                dumper << "NOT_RUN";
+                break;
+            case InferenceEngine::InferenceEngineProfileInfo::OPTIMIZED_OUT:
+                dumper << "OPTIMIZED_OUT";
+                break;
+        }
+        dumper << layer.second.layer_type << layer.second.exec_type;
+        dumper << std::to_string(layer.second.realTime_uSec / 1000.0) << std::to_string(layer.second.cpu_uSec/ 1000.0);
+        total += layer.second.realTime_uSec;
+        total_cpu += layer.second.cpu_uSec;
+        dumper.endLine();
+    }
+    // The empty cells keep the totals under the "realTime (ms)" / "cpuTime (ms)" columns.
+    dumper << "Total" << "" << "" << "";
+    // Format the totals with std::to_string, exactly like the per-layer rows above,
+    // so that every value in a time column uses the same number format.
+    dumper << std::to_string(total / 1000.0) << std::to_string(total_cpu / 1000.0);
+    dumper.endLine();
+    dumper.endLine();
+}
+
+void StatisticsReport::dumpPerformanceCounters(const std::vector<PerformaceCounters> &perfCounts) {
+    if ((_config.report_type.empty()) || (_config.report_type == noCntReport)) {
+        slog::info << "Statistics collecting for performance counters was not requested. No reports are dumped." << slog::endl;
+        return;
+    }
+    if (perfCounts.empty()) {
+        slog::info << "Peformance counters are empty. No reports are dumped." << slog::endl;
+        return;
+    }
+    CsvDumper dumper(true, _config.report_folder + _separator + "benchmark_" + _config.report_type + "_report.csv");
    if (_config.report_type == detailedCntReport) {
-        dumper << "Statistics";
-        completeCsvRow(dumper, numOfColumns, 1);
-
-        dumper << "metric";
-        for (size_t i = 0; i < _totalLayersTime.size(); i++) {
-            // detailedCntReport case
-            dumper << "req" + std::to_string(i);
+        for (auto& pc : perfCounts) {
+            dumpPerformanceCountersRequest(dumper, pc);
        }
-        completeCsvRow(dumper, numOfColumns, 4 + _totalLayersTime.size());
-        dumper << "latencies";
-        for (const auto &lat : _totalLayersTime) {
-            dumper << lat / 1000.0;
-        }
-        completeCsvRow(dumper, numOfColumns, _totalLayersTime.size());
-        dumper.endLine();
+    } else if (_config.report_type == averageCntReport) {
+        auto getAveragePerformanceCounters = [ &perfCounts ] () {
+            std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> performanceCountersAvg;
+            // sort PM data of first processed request according to layers execution order
+            auto performanceMapSorted = perfCountersSorted(perfCounts[0]);
+
+            // iterate over each processed infer request and handle its PM data
+            for (size_t i = 0; i < perfCounts.size(); i++) {
+                // iterate over each layer from sorted vector and add required PM data to the per-layer maps
+                for (const auto& pm : performanceMapSorted) {
+                    if (performanceCountersAvg.count(pm.first) == 0) {
+                        performanceCountersAvg[pm.first] = perfCounts.at(i).at(pm.first);
+                    } else {
+                        performanceCountersAvg[pm.first].realTime_uSec += perfCounts.at(i).at(pm.first).realTime_uSec;
+                        performanceCountersAvg[pm.first].cpu_uSec += perfCounts.at(i).at(pm.first).cpu_uSec;
+                    }
+                }
+            }
+            for (auto& pm : performanceCountersAvg) {
+                pm.second.realTime_uSec /= perfCounts.size();
+                pm.second.cpu_uSec /= perfCounts.size();
+            }
+            return performanceCountersAvg;
+        };
+        dumpPerformanceCountersRequest(dumper, getAveragePerformanceCounters());
+    } else {
+        throw std::logic_error("PM data can only be collected for average or detailed report types");
    }
-
-    dumper << "Execution results";
-    completeCsvRow(dumper, numOfColumns, 1);
-    dumper << "number of iterations" << iteration_number;
-    completeCsvRow(dumper, numOfColumns, 2);
-    dumper << "latency" << getMedianValue<double>(_latencies);
-    completeCsvRow(dumper, numOfColumns, 2);
-    dumper << "throughput" << fps;
-    completeCsvRow(dumper, numOfColumns, 2);
-    dumper << "total execution time" << totalExecTime;
-    completeCsvRow(dumper, numOfColumns, 2);
-
-    slog::info << "statistics report is stored to " << dumper.getFilename() << slog::endl;
-}
-
-double StatisticsReport::getMedianLatency() {
-    return getMedianValue<double>(_latencies);
-}
-
-std::vector<std::pair<std::string, InferenceEngine::InferenceEngineProfileInfo>> StatisticsReport::preparePmStatistics() {
-    if (_performanceCounters.empty()) {
-        throw std::logic_error("preparePmStatistics() was called when no PM data was collected");
-    }
-
-    // sort PM data of first processed request according to layers execution order
-    auto performanceMapSorted = perfCountersSorted(_performanceCounters[0]);
-
-    // iterate over each processed infer request and handle its PM data
-    for (auto &pm : _performanceCounters) {
-        long long total = 0L;
-        // iterate over each layer from sorted vector and add required PM data to the per-layer maps
-        for (const auto & it : performanceMapSorted) {
-            _perLayerRealTime[it.first].push_back(pm[it.first].realTime_uSec);
-            _perLayerCpuTime[it.first].push_back(pm[it.first].cpu_uSec);
-            total += pm[it.first].realTime_uSec;
-        }
-        _totalLayersTime.push_back(total);
-    }
-    return performanceMapSorted;
-}
-
-template <typename T>
-T StatisticsReport::getMedianValue(const std::vector<T> &vec) {
-    std::vector<T> sortedVec(vec);
-    std::sort(sortedVec.begin(), sortedVec.end());
-    return (sortedVec.size() % 2 != 0) ?
-           sortedVec[sortedVec.size() / 2ULL] :
-           (sortedVec[sortedVec.size() / 2ULL] + sortedVec[sortedVec.size() / 2ULL - 1ULL]) / static_cast<T>(2.0);
+    slog::info << "Pefromance counters report is stored to " << dumper.getFilename() << slog::endl;
 }
--- a/inference-engine/samples/benchmark_app/statistics_report.hpp
+++ b/inference-engine/samples/benchmark_app/statistics_report.hpp
@@ -22,51 +22,51 @@ static constexpr char detailedCntReport[] = "detailed_counters";
 /// @brief Responsible for collecting of statistics and dumping to .csv file
 class StatisticsReport {
 public:
+    typedef std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> PerformaceCounters;
+    typedef std::vector<std::pair<std::string, std::string>> Parameters;
+
    struct Config {
-        std::string device;
-        std::string api;
-        size_t batch;
-        size_t nireq;
-        size_t niter;
-        uint64_t duration;
-        size_t cpu_nthreads;
-        std::map<std::string, uint32_t> nstreams;
-        std::string cpu_pin;
        std::string report_type;
        std::string report_folder;
    };

+    enum class Category {
+        COMMAND_LINE_PARAMETERS,
+        RUNTIME_CONFIG,
+        EXECUTION_RESULTS,
+    };
+
    explicit StatisticsReport(Config config) : _config(std::move(config)) {
-        if (_config.nireq > 0) {
-            _performanceCounters.reserve(_config.nireq);
-        }
+        // _separator is a narrow std::string, so it is always initialized from a
+        // narrow literal: a wide literal (L"\\") cannot be assigned to std::string
+        // and would break the build when UNICODE is defined.
+        _separator =
+#if defined _WIN32 || defined __CYGWIN__
+        "\\";
+#else
+        "/";
+#endif
+        // With an empty report folder no separator is put in front of the file name.
+        if (_config.report_folder.empty())
+            _separator = "";
    }

-    void addPerfCounts(const std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &pmStat);
+    void addParameters(const Category &category, const Parameters& parameters);

-    void addLatencies(const std::vector<double> &latency);
+    void dump();

-    void dump(const double &fps, const size_t &numProcessedReq, const double &totalExecTime);
-
-    double getMedianLatency();
+    void dumpPerformanceCounters(const std::vector<PerformaceCounters> &perfCounts);

 private:
-    std::vector<std::pair<std::string, InferenceEngine::InferenceEngineProfileInfo>> preparePmStatistics();
-
-    template <typename T>
-    T getMedianValue(const std::vector<T> &vec);
-
-    // Contains PM data for each processed infer request
-    std::vector<std::map<std::string, InferenceEngine::InferenceEngineProfileInfo>> _performanceCounters;
-    // Contains latency of each processed infer request
-    std::vector<double> _latencies;
+    void dumpPerformanceCountersRequest(CsvDumper& dumper,
+                                        const PerformaceCounters& perfCounts);

    // configuration of current benchmark execution
    const Config _config;

-    // mapping from network layer to a vector of calculated RealTime values from each processed infer request.
-    std::map<std::string, std::vector<long long>> _perLayerRealTime;
-    // mapping from network layer to a vector of calculated CPU Time values from each processed infer request.
-    std::map<std::string, std::vector<long long>> _perLayerCpuTime;
-    std::vector<long long> _totalLayersTime;
+    // parameters
+    std::map<Category, Parameters> _parameters;
+
+    // csv separator
+    std::string _separator;
 };
--- a/inference-engine/samples/benchmark_app/utils.hpp
+++ b/inference-engine/samples/benchmark_app/utils.hpp
@@ -12,4 +12,3 @@ std::vector<std::string> parseDevices(const std::string& device_string);
 uint32_t deviceDefaultDeviceDurationInSeconds(const std::string& device);
 std::map<std::string, uint32_t> parseValuePerDevice(const std::vector<std::string>& devices,
                                                    const std::string& values_string);
-uint32_t deviceDefaultRequestsNumber(const std::string& device);
--- a/inference-engine/samples/common/format_reader/CMakeLists.txt
+++ b/inference-engine/samples/common/format_reader/CMakeLists.txt
@@ -12,24 +12,21 @@ file (GLOB LIBRARY_HEADERS
        ${CMAKE_CURRENT_SOURCE_DIR}/*.h
        )

-# Find OpenCV components if exist
-find_package(OpenCV COMPONENTS imgcodecs videoio imgproc QUIET)
-if(NOT(OpenCV_FOUND))
-    message(WARNING "OPENCV is disabled or not found, " ${TARGET_NAME} " is built without OPENCV support")
-endif()
-
 # Create named folders for the sources within the .vcproj
 # Empty name lists them directly under the .vcproj
 source_group("src" FILES ${LIBRARY_SRC})
 source_group("include" FILES ${LIBRARY_HEADERS})

-
 # Create library file from sources.
 add_library(${TARGET_NAME} SHARED ${MAIN_SRC} ${LIBRARY_HEADERS})

-if(OpenCV_FOUND)
-	target_link_libraries(${TARGET_NAME} PRIVATE ${OpenCV_LIBRARIES})
-	target_compile_definitions(${TARGET_NAME} PRIVATE USE_OPENCV)
+# Find OpenCV components if exist
+find_package(OpenCV COMPONENTS imgcodecs videoio imgproc QUIET)
+if(NOT OpenCV_FOUND)
+    message(WARNING "OPENCV is disabled or not found, " ${TARGET_NAME} " will be built without OPENCV support")
+else()
+    target_link_libraries(${TARGET_NAME} PRIVATE ${OpenCV_LIBRARIES})
+    target_compile_definitions(${TARGET_NAME} PRIVATE USE_OPENCV)
 endif()

 target_compile_definitions(${TARGET_NAME} PRIVATE IMPLEMENT_FORMAT_READER)
--- a/inference-engine/samples/common/os/windows/w_dirent.h
+++ b/inference-engine/samples/common/os/windows/w_dirent.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //

@@ -6,31 +6,33 @@

 #if defined(_WIN32)

-#ifndef NOMINMAX
-# define NOMINMAX
+#ifndef WIN32_LEAN_AND_MEAN
+# define WIN32_LEAN_AND_MEAN
+# define WIN32_LEAN_AND_MEAN_UNDEF
 #endif

-#include <winsock2.h>
-#include <windows.h>
-#include <stdlib.h>
+#ifndef NOMINMAX
+# define NOMINMAX
+# define NOMINMAX_UNDEF
+#endif

-#else
-
-#include <unistd.h>
-#include <cstdlib>
-#include <string.h>
+#if defined(_M_IX86) && !defined(_X86_) && !defined(_AMD64_)
+# define _X86_
+#endif

+#if defined(_M_X64) && !defined(_X86_) && !defined(_AMD64_)
+# define _AMD64_
 #endif

 #include <string>
-
+#include <windef.h>
+#include <fileapi.h>
+#include <Winbase.h>
 #include <sys/stat.h>

-#if defined(WIN32)
-    // Copied from linux libc sys/stat.h:
-    #define S_ISREG(m) (((m) & S_IFMT) == S_IFREG)
-    #define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR)
-#endif
+// Copied from linux libc sys/stat.h:
+#define S_ISREG(m) (((m) & S_IFMT) == S_IFREG)
+#define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR)

 struct dirent {
    char *d_name;
@@ -38,10 +40,9 @@ struct dirent {
    explicit dirent(const wchar_t *wsFilePath) {
        size_t i;
        auto slen = wcslen(wsFilePath);
-        d_name = static_cast<char*>(malloc(slen + 1));
+        d_name = static_cast<char *>(malloc(slen + 1));
        wcstombs_s(&i, d_name, slen + 1, wsFilePath, slen);
    }
-
    ~dirent() {
        free(d_name);
    }
@@ -60,6 +61,11 @@ class DIR {
    }

 public:
+    DIR(const DIR &other) = delete;
+    DIR(DIR &&other) = delete;
+    DIR& operator=(const DIR &other) = delete;
+    DIR& operator=(DIR &&other) = delete;
+
    explicit DIR(const char *dirPath) : next(nullptr) {
        std::string ws = dirPath;
        if (endsWith(ws, "\\"))
@@ -72,6 +78,7 @@ public:

    ~DIR() {
        // Release the entry held in `next`, if any. Deleting a null pointer is a
        // no-op, so no guard is needed (a `!next` guard would never free anything).
        delete next;
+        next = nullptr;
        FindClose(hFind);
    }

@@ -96,7 +103,7 @@ public:
 };


-static DIR *opendir(const char* dirPath) {
+static DIR* opendir(const char *dirPath) {
    auto dp = new DIR(dirPath);
    if (!dp->isValid()) {
        delete dp;
@@ -105,10 +112,27 @@ static DIR *opendir(const char* dirPath) {
    return dp;
 }

-static struct dirent *readdir(DIR *dp) {
+static struct dirent* readdir(DIR *dp) {
    return dp->nextEnt();
 }

 static void closedir(DIR *dp) {
    delete dp;
 }
+
+#ifdef WIN32_LEAN_AND_MEAN_UNDEF
+# undef WIN32_LEAN_AND_MEAN
+# undef WIN32_LEAN_AND_MEAN_UNDEF
+#endif
+
+#ifdef NOMINMAX_UNDEF
+# undef NOMINMAX_UNDEF
+# undef NOMINMAX
+#endif
+
+#else
+
+#include <sys/types.h>
+#include <dirent.h>
+
+#endif
--- a/inference-engine/samples/common/samples/common.hpp
+++ b/inference-engine/samples/common/samples/common.hpp
@@ -27,7 +27,7 @@
 #include <ie_blob.h>

 #ifndef UNUSED
-  #ifdef WIN32
+  #if defined (_MSC_VER) && !defined (__clang__)
    #define UNUSED
  #else
    #define UNUSED  __attribute__((unused))
@@ -1120,5 +1120,4 @@ inline void showAvailableDevices() {
    for (const auto& device : devices) {
        std::cout << "  " << device;
    }
-    std::cout << "  HDDL" << std::endl;
 }
--- a/inference-engine/samples/common/samples/console_progress.hpp
+++ b/inference-engine/samples/common/samples/console_progress.hpp
@@ -4,7 +4,8 @@

 #pragma once

-#include <iostream>
+#include <cstdio>
+#include <sstream>
 #include <iomanip>

 /**
@@ -12,12 +13,15 @@
 * @brief A ConsoleProgress class provides functionality for printing progress dynamics
 */
 class ConsoleProgress {
-    static const int DEFAULT_DETALIZATION = 20;
+    static const size_t DEFAULT_DETALIZATION = 20;
+    static const size_t DEFAULT_PERCENT_TO_UPDATE_PROGRESS = 1;

    size_t total;
-    size_t current = 0;
+    size_t cur_progress = 0;
+    size_t prev_progress = 0;
    bool stream_output;
    size_t detalization;
+    size_t percent_to_update;

 public:
    /**
@@ -25,18 +29,19 @@ public:
    * @param _total - maximum value that is correspondent to 100%
    * @param _detalization - number of symbols(.) to use to represent progress
    */
-    explicit ConsoleProgress(size_t _total, bool _stream_output = false, size_t _detalization = DEFAULT_DETALIZATION) :
-            total(_total), detalization(_detalization) {
+    explicit ConsoleProgress(size_t _total,
+                             bool _stream_output = false,
+                             size_t _percent_to_update = DEFAULT_PERCENT_TO_UPDATE_PROGRESS,
+                             size_t _detalization = DEFAULT_DETALIZATION) :
+            total(_total), detalization(_detalization), percent_to_update(_percent_to_update) {
        stream_output = _stream_output;
        if (total == 0) {
            total = 1;
        }
-        std::cout << std::unitbuf;
    }

    /**
     * @brief Shows progress with current data. Progress is shown from the beginning of the current line.
-     * @return
     */
    void showProgress() const {
        std::stringstream strm;
@@ -45,28 +50,34 @@ public:
        }
        strm << "Progress: [";
        size_t i = 0;
-        for (; i < detalization * current / total; i++) {
+        for (; i < detalization * cur_progress / total; i++) {
            strm << ".";
        }
        for (; i < detalization; i++) {
            strm << " ";
        }
-        strm << "] " << std::fixed << std::setprecision(2) << 100 * static_cast<float>(current) / total << "% done";
+        strm << "] " << std::setw(3) << 100 * cur_progress / total << "% done";
        if (stream_output) {
-            std::cout << strm.str() << std::endl;
-        } else {
-            std::cout << strm.str() << std::flush;
+            strm << std::endl;
        }
+        std::fputs(strm.str().c_str(), stdout);
+        std::fflush(stdout);
    }

    /**
     * @brief Updates current value and progressbar
-     * @param newProgress - new value to represent
     */
-    void updateProgress(size_t newProgress) {
-        current = newProgress;
-        if (current > total) current = total;
-        showProgress();
+    void updateProgress() {
+        if (cur_progress > total) cur_progress = total;
+        size_t prev_percent = 100 * prev_progress / total;
+        size_t cur_percent = 100 * cur_progress / total;
+
+        if (prev_progress == 0 ||
+            cur_progress == total ||
+            prev_percent + percent_to_update <= cur_percent) {
+            showProgress();
+            prev_progress = cur_progress;
+        }
    }

    /**
@@ -74,10 +85,11 @@ public:
     * @param add - value to add
     */
    void addProgress(int add) {
-        if (add < 0 && -add > static_cast<int>(current)) {
-            add = -static_cast<int>(current);
+        if (add < 0 && -add > static_cast<int>(cur_progress)) {
+            add = -static_cast<int>(cur_progress);
        }
-        updateProgress(current + add);
+        cur_progress += add;
+        updateProgress();
    }

    /**
@@ -85,6 +97,9 @@ public:
     * @return
     */
    void finish() {
-        std::cerr << std::nounitbuf << "\n";
+        std::stringstream strm;
+        strm << std::endl;
+        std::fputs(strm.str().c_str(), stdout);
+        std::fflush(stdout);
    }
 };
--- a/inference-engine/samples/hello_classification/README.md
+++ b/inference-engine/samples/hello_classification/README.md
@@ -10,8 +10,8 @@ It demonstrates how to use the following Inference Engine API in applications:

 There is also an API introduced to crop a ROI object and set it as input without additional memory re-allocation.
 To properly demonstrate this API, it is required to run several networks in pipeline which is out of scope of this sample.
-Please refer to [Security Barrier Camera Demo](./inference-engine/samples/security_barrier_camera_demo/README.md), or
-[Crossroad Camera Demo](./inference-engine/samples/crossroad_camera_demo/README.md) with an example of using of new crop ROI API.
+Please refer to [Security Barrier Camera Demo](./demos/security_barrier_camera_demo/README.md) or
+[Crossroad Camera Demo](./demos/crossroad_camera_demo/README.md) for an example of using the new crop ROI API.

 Refer to [Integrate the Inference Engine New Request API with Your Application](./docs/IE_DG/Integrate_with_customer_application_new_API.md) for details.

--- a/inference-engine/samples/hello_query_device/README.md
+++ b/inference-engine/samples/hello_query_device/README.md
@@ -1,8 +1,8 @@
 # Hello Query Device C++ Sample

-This topic demonstrates how to run the Hello Query Device sample application, which queries Inference Engine devices and prints their metrics and default configuration values. The sample shows how to use [Query Device API feature](./docs/IE_DG/QueryDeviceAPI.md).
+This topic demonstrates how to run the Hello Query Device sample application, which queries Inference Engine devices and prints their metrics and default configuration values. The sample shows how to use [Query Device API feature](./docs/IE_DG/InferenceEngine_QueryAPI.md).
 > **NOTE:** This topic describes usage of C++ implementation of the Query Device Sample. 
-> For the Python* implementation, refer to [Hello Query Device Python* Sample](./inference-engine/ie_brudges/python/sample/hello_query_device/README.md)
+> For the Python* implementation, refer to [Hello Query Device Python* Sample](./inference-engine/ie_bridges/python/sample/hello_query_device/README.md)
 ## Running

 To see the queried information, run the following:
--- a/inference-engine/samples/object_detection_sample_ssd/README.md
+++ b/inference-engine/samples/object_detection_sample_ssd/README.md
@@ -3,6 +3,8 @@
 This topic demonstrates how to run the Object Detection sample application, which does inference using object detection
 networks like SSD-VGG on Intel® Processors and Intel® HD Graphics.

+> **NOTE:** This topic describes usage of C++ implementation of the Object Detection Sample SSD. For the Python* implementation, refer to [Object Detection Python* Sample SSD](./inference-engine/ie_bridges/python/sample/object_detection_sample_ssd/README.md).
+
 ## How It Works

 Upon the start-up the sample application reads command line parameters and loads a network and an image to the Inference
--- a/inference-engine/samples/speech_sample/README.md
+++ b/inference-engine/samples/speech_sample/README.md
@@ -48,17 +48,15 @@ will be removed in GNA hardware version 3 and higher.
 #### Execution Modes

 Several execution modes are supported via the `-d` flag.  If the device
-is set to `CPU` and the GNA plugin is selected, the GNA device is
-emulated in fast-but-not-bit-exact mode.  If the device is set to
-`GNA_AUTO`, then the GNA hardware is used if available and the driver is
-installed.  Otherwise, the GNA device is emulated in
-fast-but-not-bit-exact mode.  If the device is set to `GNA_HW`, then the
-GNA hardware is used if available and the driver is installed.
+is set to `CPU` mode, then all calculations are performed on the CPU device
+using the CPU Plugin.  If the device is set to `GNA_AUTO`, then the GNA hardware is
+used if available and the driver is installed.  Otherwise, the GNA device is 
+emulated in fast-but-not-bit-exact mode.  If the device is set to `GNA_HW`,
+then the GNA hardware is used if available and the driver is installed.
 Otherwise, an error will occur.  If the device is set to `GNA_SW`, the
 GNA device is emulated in fast-but-not-bit-exact mode.  Finally, if
 the device is set to `GNA_SW_EXACT`, the GNA device is emulated in
 bit-exact mode.
-`GNA_SW_FP32` mode is used for calculation on CPU device using GNA Plugin.

 #### Loading and Saving Models

@@ -94,7 +92,7 @@ Options:
    -m "<path>"             Required. Path to an .xml file with a trained model (required if -rg is missing).
    -o "<path>"             Optional. Output file name (default name is "scores.ark").
    -l "<absolute_path>"    Required for CPU custom layers. Absolute path to a shared library with the kernel implementations.
-    -d "<device>"           Optional. Specify a target device to infer on. CPU, GPU, GNA_AUTO, GNA_HW, GNA_SW, GNA_SW_EXACT, GNA_SW_FP32 and HETERO with combination of GNA
+    -d "<device>"           Optional. Specify a target device to infer on. CPU, GPU, GNA_AUTO, GNA_HW, GNA_SW, GNA_SW_EXACT and HETERO with combination of GNA
     as the primary device and CPU as a secondary (e.g. HETERO:GNA,CPU) are supported. The list of available devices is shown below. The sample will look for a suitable plugin for device specified.
    -p                      Optional. Plugin name. For example, GPU. If this parameter is set, the sample will look for this plugin only
    -pc                     Optional. Enables performance report
--- a/inference-engine/samples/speech_sample/main.cpp
+++ b/inference-engine/samples/speech_sample/main.cpp
@@ -706,7 +706,7 @@ int main(int argc, char *argv[]) {
            outputInfo = netBuilder.getNetwork().getOutputsInfo();
        }

-        Blob::Ptr ptrOutputBlob = inferRequests[0].inferRequest.GetBlob(cOutputInfo.rbegin()->first);
+        Blob::Ptr ptrOutputBlob = inferRequests.begin()->inferRequest.GetBlob(cOutputInfo.rbegin()->first);

        for (auto &item : outputInfo) {
            DataPtr outData = item.second;
@@ -839,7 +839,7 @@ int main(int argc, char *argv[]) {
                            if (!FLAGS_o.empty()) {
                                outputFrame =
                                        &ptrScores.front() + numScoresPerFrame * sizeof(float) * (inferRequest.frameIndex);
-                                Blob::Ptr outputBlob = inferRequest.inferRequest.GetBlob(cOutputInfo.begin()->first);
+                                Blob::Ptr outputBlob = inferRequest.inferRequest.GetBlob(cOutputInfo.rbegin()->first);
                                auto byteSize = inferRequest.numFramesThisBatch * numScoresPerFrame * sizeof(float);
                                std::memcpy(outputFrame,
                                            outputBlob->buffer(),
@@ -848,7 +848,7 @@ int main(int argc, char *argv[]) {

                            if (!FLAGS_r.empty()) {
                                Blob::Ptr outputBlob = inferRequest.inferRequest.GetBlob(cOutputInfo.begin()->first);
-                                CompareScores(outputBlob->buffer().as<float *>(),
+                                CompareScores(outputBlob->buffer().as<float*>(),
                                              &ptrReferenceScores[inferRequest.frameIndex *
                                                                  numFrameElementsReference *
                                                                  numBytesPerElementReference],
@@ -876,7 +876,7 @@ int main(int argc, char *argv[]) {
                        ptrInputBlobs.push_back(inferRequest.inferRequest.GetBlob(input.first));
                    }

-                    for (size_t i = 0; i < numInputArkFiles; i++) {
+                    for (size_t i = 0; i < numInputArkFiles; ++i) {
                        std::memcpy(ptrInputBlobs[i]->buffer(),
                                    inputFrame[i],
                                    ptrInputBlobs[i]->byteSize());
@@ -890,14 +890,14 @@ int main(int argc, char *argv[]) {
                    frameIndex += numFramesThisBatch;
                    for (size_t j = 0; j < inputArkFiles.size(); j++) {
                        if (FLAGS_cw_l > 0 || FLAGS_cw_r > 0) {
-                            int i = frameIndex - FLAGS_cw_l;
-                            if (i > 0 && i < static_cast<int>(numFramesArkFile)) {
+                            int idx = frameIndex - FLAGS_cw_l;
+                            if (idx > 0 && idx < static_cast<int>(numFramesArkFile)) {
                                inputFrame[j] += sizeof(float) * numFrameElementsInput[j] * numFramesThisBatch;
-                            } else if (i >= static_cast<int>(numFramesArkFile)) {
-                                inputFrame[j] = &ptrUtterances[0].front() +
+                            } else if (idx >= static_cast<int>(numFramesArkFile)) {
+                                inputFrame[j] = &ptrUtterances[j].front() +
                                        (numFramesArkFile - 1) * sizeof(float) * numFrameElementsInput[j] * numFramesThisBatch;
-                            } else if (i < 0) {
-                                inputFrame[j] = &ptrUtterances[0].front();
+                            } else if (idx <= 0) {
+                                inputFrame[j] = &ptrUtterances[j].front();
                            }
                        } else {
                            inputFrame[j] += sizeof(float) * numFrameElementsInput[j] * numFramesThisBatch;
@@ -905,7 +905,6 @@ int main(int argc, char *argv[]) {
                    }
                    inferRequestFetched |= true;
                }
-
                if (!inferRequestFetched) {
                    std::this_thread::sleep_for(std::chrono::milliseconds(1));
                    continue;
--- a/inference-engine/samples/speech_sample/speech_sample.hpp
+++ b/inference-engine/samples/speech_sample/speech_sample.hpp
@@ -23,7 +23,7 @@ static const char plugin_message[] = "Plugin name. For example MKLDNNPlugin. If
                                     "the sample will look for this plugin only";

 /// @brief message for assigning cnn calculation to device
-static const char target_device_message[] = "Specify a target device to infer on. CPU, GPU, GNA_AUTO, GNA_HW, GNA_SW, GNA_SW_FP32 "
+static const char target_device_message[] = "Specify a target device to infer on. CPU, GPU, GNA_AUTO, GNA_HW, GNA_SW, "
                                            "GNA_SW_EXACT and HETERO with combination of GNA as the primary device and CPU"
                                            " as a secondary (e.g. HETERO:GNA,CPU) are supported. The list of available devices is shown below. "
                                            "The sample will look for a suitable plugin for device specified.";
--- a/inference-engine/samples/thirdparty/gflags/.gitmodules
+++ b/inference-engine/samples/thirdparty/gflags/.gitmodules
@@ -1,4 +0,0 @@
-[submodule "doc"]
-	path = doc
-	url = https://github.com/gflags/gflags.git
-	branch = gh-pages