[CPP Speech Sample] Improve -o and -oname flags (#10321)

* Improve `-o` and `-oname` flags
* Apply clang-format tool
* fix saving output files
* Apply clang-format
* Fix error when `-oname` not specified
* apply clang format
* Fix error `-oname`
* Use output name with port to find model output
* fix comment line breaking
* fix comparison with reference for multiple outputs
* Fix output name printing error
* try to fix clang format
* fix problem with bs > 1
* minimal change to rerun test pipeline
* clang format
* Revert "Fix error `-oname`"

  This reverts commit c33d5f16e8.
2022-02-25 11:25:35 +03:00
parent 9e3610c028
commit bacf597516
2 changed files with 273 additions and 240 deletions
--- a/samples/cpp/speech_sample/README.md
+++ b/samples/cpp/speech_sample/README.md
@@ -1,6 +1,6 @@
 # Automatic Speech Recognition C++ Sample {#openvino_inference_engine_samples_speech_sample_README}

-This sample demonstrates how to execute an Asynchronous Inference of acoustic model based on Kaldi\* neural networks and speech feature vectors.
+This sample demonstrates how to execute an Asynchronous Inference of acoustic model based on Kaldi\* neural networks and speech feature vectors.  

 The sample works with Kaldi ARK or Numpy* uncompressed NPZ files, so it does not cover an end-to-end speech recognition scenario (speech to text), requiring additional preprocessing (feature extraction) to get a feature vector from a speech signal, as well as postprocessing (decoding) to produce text from scores.

--- a/samples/cpp/speech_sample/main.cpp
+++ b/samples/cpp/speech_sample/main.cpp
@@ -86,10 +86,11 @@ int main(int argc, char* argv[]) {
        uint32_t batchSize = (FLAGS_cw_r > 0 || FLAGS_cw_l > 0 || !FLAGS_bs) ? 1 : (uint32_t)FLAGS_bs;
        std::shared_ptr<ov::Model> model;
        std::vector<std::string> outputs;
+        std::vector<std::string> output_names;
        std::vector<size_t> ports;
        // --------------------------- Processing custom outputs ---------------------------------------------
        if (!FLAGS_oname.empty()) {
-            std::vector<std::string> output_names = convert_str_to_vector(FLAGS_oname);
+            output_names = convert_str_to_vector(FLAGS_oname);
            for (const auto& output_name : output_names) {
                auto pos_layer = output_name.rfind(":");
                if (pos_layer == std::string::npos) {
@@ -248,10 +249,9 @@ int main(int argc, char* argv[]) {
        auto t0 = Time::now();
        ms loadTime = std::chrono::duration_cast<ms>(Time::now() - t0);
        slog::info << "Model loading time " << loadTime.count() << " ms" << slog::endl;
-        slog::info << "Loading model to the device " << FLAGS_d << slog::endl;
        ov::CompiledModel executableNet;
        if (!FLAGS_m.empty()) {
-            slog::info << "Loading model to the device" << slog::endl;
+            slog::info << "Loading model to the device " << FLAGS_d << slog::endl;
            executableNet = core.compile_model(model, deviceStr, genericPluginConfig);
        } else {
            slog::info << "Importing model to the device" << slog::endl;
@@ -344,157 +344,184 @@ int main(int argc, char* argv[]) {
        }
        // -----------------------------------------------------------------------------------------------------
        // --------------------------- Step 5. Do inference --------------------------------------------------------
-        for (size_t next_output = 0; next_output < count_file; next_output++) {
-            std::vector<std::vector<uint8_t>> ptrUtterances;
-            std::vector<uint8_t> ptrScores;
-            std::vector<uint8_t> ptrReferenceScores;
-            ScoreErrorT frameError, totalError;
-            ptrUtterances.resize(inputFiles.size());
-            // initialize memory state before starting
-            for (auto&& state : inferRequests.begin()->inferRequest.query_state()) {
-                state.reset();
-            }
-            /** Work with each utterance **/
-            for (uint32_t utteranceIndex = 0; utteranceIndex < numUtterances; ++utteranceIndex) {
-                std::map<std::string, ov::ProfilingInfo> utterancePerfMap;
-                uint64_t totalNumberOfRunsOnHw = 0;
-                std::string uttName;
-                uint32_t numFrames(0), n(0);
-                std::vector<uint32_t> numFrameElementsInput;
-                uint32_t numFramesReference(0), numFrameElementsReference(0), numBytesPerElementReference(0),
-                    numBytesReferenceScoreThisUtterance(0);
-                auto dims = executableNet.outputs()[0].get_shape();
-                const auto numScoresPerFrame =
-                    std::accumulate(std::begin(dims), std::end(dims), size_t{1}, std::multiplies<size_t>());
-                slog::info << "Number scores per frame : " << numScoresPerFrame << slog::endl;
-                /** Get information from input file for current utterance **/
-                numFrameElementsInput.resize(numInputFiles);
-                for (size_t i = 0; i < inputFiles.size(); i++) {
-                    std::vector<uint8_t> ptrUtterance;
-                    auto inputFilename = inputFiles[i].c_str();
-                    uint32_t currentNumFrames(0), currentNumFrameElementsInput(0), currentNumBytesPerElementInput(0);
-                    file->get_file_info(inputFilename, utteranceIndex, &n, &numBytesThisUtterance[i]);
-                    ptrUtterance.resize(numBytesThisUtterance[i]);
-                    file->load_file(inputFilename,
-                                    utteranceIndex,
-                                    uttName,
-                                    ptrUtterance,
-                                    &currentNumFrames,
-                                    &currentNumFrameElementsInput,
-                                    &currentNumBytesPerElementInput);
-                    if (numFrames == 0) {
-                        numFrames = currentNumFrames;
-                    } else if (numFrames != currentNumFrames) {
-                        std::string errMessage("Number of frames in input files is different: " +
-                                               std::to_string(numFrames) + " and " + std::to_string(currentNumFrames));
-                        throw std::logic_error(errMessage);
-                    }
-                    ptrUtterances[i] = ptrUtterance;
-                    numFrameElementsInput[i] = currentNumFrameElementsInput;
-                }
-                int i = 0;
-                for (auto& ptrInputBlob : ptrInputBlobs) {
-                    if (ptrInputBlob.get_size() != numFrameElementsInput[i++] * batchSize) {
-                        throw std::logic_error("network input size(" + std::to_string(ptrInputBlob.get_size()) +
-                                               ") mismatch to input file size (" +
-                                               std::to_string(numFrameElementsInput[i - 1] * batchSize) + ")");
-                    }
-                }
-                ptrScores.resize(numFrames * numScoresPerFrame * sizeof(float));
-                if (!FLAGS_r.empty()) {
-                    /** Read file with reference scores **/
-                    BaseFile* fileReferenceScores;
-                    auto exReferenceScoresFile = fileExt(FLAGS_r);
-                    if (exReferenceScoresFile == "ark") {
-                        fileReferenceScores = &arkFile;
-                    } else if (exReferenceScoresFile == "npz") {
-                        fileReferenceScores = &numpyFile;
-                    } else {
-                        throw std::logic_error("Invalid Reference Scores file");
-                    }
-                    std::string refUtteranceName;
-                    fileReferenceScores->get_file_info(reference_name_files[next_output].c_str(),
-                                                       utteranceIndex,
-                                                       &n,
-                                                       &numBytesReferenceScoreThisUtterance);
-                    ptrReferenceScores.resize(numBytesReferenceScoreThisUtterance);
-                    fileReferenceScores->load_file(reference_name_files[next_output].c_str(),
-                                                   utteranceIndex,
-                                                   refUtteranceName,
-                                                   ptrReferenceScores,
-                                                   &numFramesReference,
-                                                   &numFrameElementsReference,
-                                                   &numBytesPerElementReference);
-                }
-                double totalTime = 0.0;
-                std::cout << "Utterance " << utteranceIndex << ": " << std::endl;
-                clear_score_error(&totalError);
-                totalError.threshold = frameError.threshold = MAX_SCORE_DIFFERENCE;
-                auto outputFrame = &ptrScores.front();
-                std::vector<uint8_t*> inputFrame;
-                for (auto& ut : ptrUtterances) {
-                    inputFrame.push_back(&ut.front());
-                }
-                std::map<std::string, ov::ProfilingInfo> callPerfMap;
-                size_t frameIndex = 0;
-                uint32_t numFramesFile = numFrames;
-                numFrames += FLAGS_cw_l + FLAGS_cw_r;
-                uint32_t numFramesThisBatch{batchSize};
-                auto t0 = Time::now();
-                auto t1 = t0;
-                while (frameIndex <= numFrames) {
-                    if (frameIndex == numFrames) {
-                        if (std::find_if(inferRequests.begin(), inferRequests.end(), [&](InferRequestStruct x) {
-                                return (x.frameIndex != -1);
-                            }) == inferRequests.end()) {
-                            break;
-                        }
-                    }
-                    bool inferRequestFetched = false;
-                    /** Start inference loop **/
-                    for (auto& inferRequest : inferRequests) {
-                        if (frameIndex == numFrames) {
-                            numFramesThisBatch = 1;
-                        } else {
-                            numFramesThisBatch =
-                                (numFrames - frameIndex < batchSize) ? (numFrames - frameIndex) : batchSize;
-                        }
+        std::vector<std::vector<uint8_t>> ptrUtterances;
+        std::vector<std::vector<uint8_t>> vectorPtrScores((outputs.size() == 0) ? 1 : outputs.size());
+        std::vector<uint16_t> numScoresPerOutput((outputs.size() == 0) ? 1 : outputs.size());
+        std::vector<std::vector<uint8_t>> vectorPtrReferenceScores(reference_name_files.size());
+        std::vector<ScoreErrorT> vectorFrameError(reference_name_files.size()),
+            vectorTotalError(reference_name_files.size());
+        ptrUtterances.resize(inputFiles.size());
+        // initialize memory state before starting
+        for (auto&& state : inferRequests.begin()->inferRequest.query_state()) {
+            state.reset();
+        }
+        /** Work with each utterance **/
+        for (uint32_t utteranceIndex = 0; utteranceIndex < numUtterances; ++utteranceIndex) {
+            std::map<std::string, ov::ProfilingInfo> utterancePerfMap;
+            uint64_t totalNumberOfRunsOnHw = 0;
+            std::string uttName;
+            uint32_t numFrames(0), n(0);
+            std::vector<uint32_t> numFrameElementsInput;
+            std::vector<uint32_t> numFramesReference(reference_name_files.size()),
+                numFrameElementsReference(reference_name_files.size()),
+                numBytesPerElementReference(reference_name_files.size()),
+                numBytesReferenceScoreThisUtterance(reference_name_files.size());
+
+            /** Get information from input file for current utterance **/
+            numFrameElementsInput.resize(numInputFiles);
+            for (size_t i = 0; i < inputFiles.size(); i++) {
+                std::vector<uint8_t> ptrUtterance;
+                auto inputFilename = inputFiles[i].c_str();
+                uint32_t currentNumFrames(0), currentNumFrameElementsInput(0), currentNumBytesPerElementInput(0);
+                file->get_file_info(inputFilename, utteranceIndex, &n, &numBytesThisUtterance[i]);
+                ptrUtterance.resize(numBytesThisUtterance[i]);
+                file->load_file(inputFilename,
+                                utteranceIndex,
+                                uttName,
+                                ptrUtterance,
+                                &currentNumFrames,
+                                &currentNumFrameElementsInput,
+                                &currentNumBytesPerElementInput);
+                if (numFrames == 0) {
+                    numFrames = currentNumFrames;
+                } else if (numFrames != currentNumFrames) {
+                    std::string errMessage("Number of frames in input files is different: " +
+                                           std::to_string(numFrames) + " and " + std::to_string(currentNumFrames));
+                    throw std::logic_error(errMessage);
+                }
+                ptrUtterances[i] = ptrUtterance;
+                numFrameElementsInput[i] = currentNumFrameElementsInput;
+            }
+            int i = 0;
+            for (auto& ptrInputBlob : ptrInputBlobs) {
+                if (ptrInputBlob.get_size() != numFrameElementsInput[i++] * batchSize) {
+                    throw std::logic_error("network input size(" + std::to_string(ptrInputBlob.get_size()) +
+                                           ") mismatch to input file size (" +
+                                           std::to_string(numFrameElementsInput[i - 1] * batchSize) + ")");
+                }
+            }
+
+            double totalTime = 0.0;
+
+            for (size_t errorIndex = 0; errorIndex < vectorFrameError.size(); errorIndex++) {
+                clear_score_error(&vectorTotalError[errorIndex]);
+                vectorTotalError[errorIndex].threshold = vectorFrameError[errorIndex].threshold = MAX_SCORE_DIFFERENCE;
+            }
+
+            std::vector<uint8_t*> inputFrame;
+            for (auto& ut : ptrUtterances) {
+                inputFrame.push_back(&ut.front());
+            }
+            std::map<std::string, ov::ProfilingInfo> callPerfMap;
+            size_t frameIndex = 0;
+            uint32_t numFramesFile = numFrames;
+            numFrames += FLAGS_cw_l + FLAGS_cw_r;
+            uint32_t numFramesThisBatch{batchSize};
+            auto t0 = Time::now();
+            auto t1 = t0;
+
+            BaseFile* fileReferenceScores;
+            std::string refUtteranceName;
+
+            if (!FLAGS_r.empty()) {
+                /** Read file with reference scores **/
+                auto exReferenceScoresFile = fileExt(FLAGS_r);
+                if (exReferenceScoresFile == "ark") {
+                    fileReferenceScores = &arkFile;
+                } else if (exReferenceScoresFile == "npz") {
+                    fileReferenceScores = &numpyFile;
+                } else {
+                    throw std::logic_error("Invalid Reference Scores file");
+                }
+                for (size_t next_output = 0; next_output < count_file; next_output++) {
+                    if (fileReferenceScores != nullptr) {
+                        fileReferenceScores->get_file_info(reference_name_files[next_output].c_str(),
+                                                           utteranceIndex,
+                                                           &n,
+                                                           &numBytesReferenceScoreThisUtterance[next_output]);
+                        vectorPtrReferenceScores[next_output].resize(numBytesReferenceScoreThisUtterance[next_output]);
+                        fileReferenceScores->load_file(reference_name_files[next_output].c_str(),
+                                                       utteranceIndex,
+                                                       refUtteranceName,
+                                                       vectorPtrReferenceScores[next_output],
+                                                       &numFramesReference[next_output],
+                                                       &numFrameElementsReference[next_output],
+                                                       &numBytesPerElementReference[next_output]);
+                    }
+                }
+            }
+
+            while (frameIndex <= numFrames) {
+                if (frameIndex == numFrames) {
+                    if (std::find_if(inferRequests.begin(), inferRequests.end(), [&](InferRequestStruct x) {
+                            return (x.frameIndex != -1);
+                        }) == inferRequests.end()) {
+                        break;
+                    }
+                }
+                bool inferRequestFetched = false;
+                /** Start inference loop **/
+                for (auto& inferRequest : inferRequests) {
+                    if (frameIndex == numFrames) {
+                        numFramesThisBatch = 1;
+                    } else {
+                        numFramesThisBatch =
+                            (numFrames - frameIndex < batchSize) ? (numFrames - frameIndex) : batchSize;
+                    }
+
+                    /* waits until inference result becomes available */
+                    if (inferRequest.frameIndex != -1) {
+                        inferRequest.inferRequest.wait();
+                        if (inferRequest.frameIndex >= 0)
+                            for (size_t next_output = 0; next_output < count_file; next_output++) {
+                                std::string outputName = (outputs.size() == 0) ? executableNet.output(0).get_any_name()
+                                                                               : output_names[next_output];
+                                auto dims = executableNet.output(outputName).get_shape();
+                                numScoresPerOutput[next_output] = std::accumulate(std::begin(dims),
+                                                                                  std::end(dims),
+                                                                                  size_t{1},
+                                                                                  std::multiplies<size_t>());
+
+                                vectorPtrScores[next_output].resize(numFramesFile * numScoresPerOutput[next_output] *
+                                                                    sizeof(float));

-                        /* waits until inference result becomes available */
-                        if (inferRequest.frameIndex != -1) {
-                            inferRequest.inferRequest.wait();
-                            if (inferRequest.frameIndex >= 0) {
                                if (!FLAGS_o.empty()) {
                                    /* Prepare output data for save to file in future */
-                                    outputFrame = &ptrScores.front() +
-                                                  numScoresPerFrame * sizeof(float) * (inferRequest.frameIndex);
+                                    auto outputFrame =
+                                        &vectorPtrScores[next_output].front() +
+                                        numScoresPerOutput[next_output] * sizeof(float) * (inferRequest.frameIndex);

                                    ov::Tensor outputBlob =
-                                        inferRequest.inferRequest.get_tensor(executableNet.outputs()[0]);
+                                        inferRequest.inferRequest.get_tensor(executableNet.output(outputName));
                                    if (!outputs.empty()) {
                                        outputBlob =
-                                            inferRequest.inferRequest.get_tensor(executableNet.output(FLAGS_oname));
+                                            inferRequest.inferRequest.get_tensor(executableNet.output(outputName));
                                    }
-                                    // locked memory holder should be alive all time while access to its buffer
-                                    // happens
-                                    auto byteSize = numScoresPerFrame * sizeof(float);
+                                    // locked memory holder should be alive all time while access to its buffer happens
+                                    auto byteSize = numScoresPerOutput[next_output] * sizeof(float);
                                    std::memcpy(outputFrame, outputBlob.data<float>(), byteSize);
                                }
                                if (!FLAGS_r.empty()) {
                                    /** Compare output data with reference scores **/
                                    ov::Tensor outputBlob =
-                                        inferRequest.inferRequest.get_tensor(executableNet.outputs()[0]);
-                                    if (!FLAGS_oname.empty())
-                                        outputBlob =
-                                            inferRequest.inferRequest.get_tensor(executableNet.output(FLAGS_oname));
-                                    compare_scores(
-                                        outputBlob.data<float>(),
-                                        &ptrReferenceScores[inferRequest.frameIndex * numFrameElementsReference *
-                                                            numBytesPerElementReference],
-                                        &frameError,
-                                        inferRequest.numFramesThisBatch,
-                                        numFrameElementsReference);
-                                    update_score_error(&frameError, &totalError);
+                                        inferRequest.inferRequest.get_tensor(executableNet.output(outputName));
+
+                                    if (numScoresPerOutput[next_output] / numFrameElementsReference[next_output] ==
+                                        batchSize) {
+                                        compare_scores(
+                                            outputBlob.data<float>(),
+                                            &vectorPtrReferenceScores[next_output]
+                                                                     [inferRequest.frameIndex *
+                                                                      numFrameElementsReference[next_output] *
+                                                                      numBytesPerElementReference[next_output]],
+                                            &vectorFrameError[next_output],
+                                            inferRequest.numFramesThisBatch,
+                                            numFrameElementsReference[next_output]);
+                                        update_score_error(&vectorFrameError[next_output],
+                                                           &vectorTotalError[next_output]);
+                                    } else {
+                                        throw std::logic_error("Number of output and reference frames does not match.");
+                                    }
                                }
                                if (FLAGS_pc) {
                                    // retrieve new counters
@@ -503,90 +530,108 @@ int main(int argc, char* argv[]) {
                                    sum_performance_counters(callPerfMap, utterancePerfMap, totalNumberOfRunsOnHw);
                                }
                            }
-                            // -----------------------------------------------------------------------------------------------------
-                        }
-                        if (frameIndex == numFrames) {
-                            inferRequest.frameIndex = -1;
-                            continue;
-                        }
-                        ptrInputBlobs.clear();
-                        if (FLAGS_iname.empty()) {
-                            for (auto& input : cInputInfo) {
-                                ptrInputBlobs.push_back(inferRequest.inferRequest.get_tensor(input));
-                            }
-                        } else {
-                            std::vector<std::string> inputNameBlobs = convert_str_to_vector(FLAGS_iname);
-                            for (const auto& input : inputNameBlobs) {
-                                ov::Tensor blob = inferRequests.begin()->inferRequest.get_tensor(input);
-                                if (!blob) {
-                                    std::string errMessage("No blob with name : " + input);
-                                    throw std::logic_error(errMessage);
-                                }
-                                ptrInputBlobs.push_back(blob);
-                            }
-                        }
-
-                        /** Iterate over all the input blobs **/
-                        for (size_t i = 0; i < numInputFiles; ++i) {
-                            ov::Tensor minput = ptrInputBlobs[i];
-                            if (!minput) {
-                                std::string errMessage("We expect ptrInputBlobs[" + std::to_string(i) +
-                                                       "] to be inherited from Tensor, " +
-                                                       "but in fact we were not able to cast input to Tensor");
-                                throw std::logic_error(errMessage);
-                            }
-                            memcpy(minput.data<float>(), inputFrame[i], minput.get_byte_size());
-                            // Used to infer fewer frames than the batch size
-                            if (batchSize != numFramesThisBatch) {
-                                memset(minput.data<float>() + numFramesThisBatch * numFrameElementsInput[i],
-                                       0,
-                                       (batchSize - numFramesThisBatch) * numFrameElementsInput[i]);
-                            }
-                        }
                        // -----------------------------------------------------------------------------------------------------
-                        int index = static_cast<int>(frameIndex) - (FLAGS_cw_l + FLAGS_cw_r);
-                        /* Starting inference in asynchronous mode*/
-                        inferRequest.inferRequest.start_async();
-                        inferRequest.frameIndex = index < 0 ? -2 : index;
-                        inferRequest.numFramesThisBatch = numFramesThisBatch;
-                        frameIndex += numFramesThisBatch;
-                        for (size_t j = 0; j < inputFiles.size(); j++) {
-                            if (FLAGS_cw_l > 0 || FLAGS_cw_r > 0) {
-                                int idx = frameIndex - FLAGS_cw_l;
-                                if (idx > 0 && idx < static_cast<int>(numFramesFile)) {
-                                    inputFrame[j] += sizeof(float) * numFrameElementsInput[j] * numFramesThisBatch;
-                                } else if (idx >= static_cast<int>(numFramesFile)) {
-                                    inputFrame[j] = &ptrUtterances[j].front() + (numFramesFile - 1) * sizeof(float) *
-                                                                                    numFrameElementsInput[j] *
-                                                                                    numFramesThisBatch;
-                                } else if (idx <= 0) {
-                                    inputFrame[j] = &ptrUtterances[j].front();
-                                }
-                            } else {
-                                inputFrame[j] += sizeof(float) * numFrameElementsInput[j] * numFramesThisBatch;
-                            }
-                        }
-                        inferRequestFetched |= true;
                    }
-                    /** Inference was finished for current frame **/
-                    if (!inferRequestFetched) {
-                        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+                    if (frameIndex == numFrames) {
+                        inferRequest.frameIndex = -1;
                        continue;
                    }
-                }
-                t1 = Time::now();
-                fsec fs = t1 - t0;
-                ms d = std::chrono::duration_cast<ms>(fs);
-                totalTime += d.count();
-                // resetting state between utterances
-                for (auto&& state : inferRequests.begin()->inferRequest.query_state()) {
-                    state.reset();
-                }
-                // -----------------------------------------------------------------------------------------------------
+                    ptrInputBlobs.clear();
+                    if (FLAGS_iname.empty()) {
+                        for (auto& input : cInputInfo) {
+                            ptrInputBlobs.push_back(inferRequest.inferRequest.get_tensor(input));
+                        }
+                    } else {
+                        std::vector<std::string> inputNameBlobs = convert_str_to_vector(FLAGS_iname);
+                        for (const auto& input : inputNameBlobs) {
+                            ov::Tensor blob = inferRequests.begin()->inferRequest.get_tensor(input);
+                            if (!blob) {
+                                std::string errMessage("No blob with name : " + input);
+                                throw std::logic_error(errMessage);
+                            }
+                            ptrInputBlobs.push_back(blob);
+                        }
+                    }

-                // --------------------------- Step 6. Process output
-                // -------------------------------------------------------
+                    /** Iterate over all the input blobs **/
+                    for (size_t i = 0; i < numInputFiles; ++i) {
+                        ov::Tensor minput = ptrInputBlobs[i];
+                        if (!minput) {
+                            std::string errMessage("We expect ptrInputBlobs[" + std::to_string(i) +
+                                                   "] to be inherited from Tensor, " +
+                                                   "but in fact we were not able to cast input to Tensor");
+                            throw std::logic_error(errMessage);
+                        }
+                        memcpy(minput.data<float>(), inputFrame[i], minput.get_byte_size());
+                        // Used to infer fewer frames than the batch size
+                        if (batchSize != numFramesThisBatch) {
+                            memset(minput.data<float>() + numFramesThisBatch * numFrameElementsInput[i],
+                                   0,
+                                   (batchSize - numFramesThisBatch) * numFrameElementsInput[i]);
+                        }
+                    }
+                    // -----------------------------------------------------------------------------------------------------
+                    int index = static_cast<int>(frameIndex) - (FLAGS_cw_l + FLAGS_cw_r);
+                    /* Starting inference in asynchronous mode*/
+                    inferRequest.inferRequest.start_async();
+                    inferRequest.frameIndex = index < 0 ? -2 : index;
+                    inferRequest.numFramesThisBatch = numFramesThisBatch;
+                    frameIndex += numFramesThisBatch;
+                    for (size_t j = 0; j < inputFiles.size(); j++) {
+                        if (FLAGS_cw_l > 0 || FLAGS_cw_r > 0) {
+                            int idx = frameIndex - FLAGS_cw_l;
+                            if (idx > 0 && idx < static_cast<int>(numFramesFile)) {
+                                inputFrame[j] += sizeof(float) * numFrameElementsInput[j] * numFramesThisBatch;
+                            } else if (idx >= static_cast<int>(numFramesFile)) {
+                                inputFrame[j] = &ptrUtterances[j].front() + (numFramesFile - 1) * sizeof(float) *
+                                                                                numFrameElementsInput[j] *
+                                                                                numFramesThisBatch;
+                            } else if (idx <= 0) {
+                                inputFrame[j] = &ptrUtterances[j].front();
+                            }
+                        } else {
+                            inputFrame[j] += sizeof(float) * numFrameElementsInput[j] * numFramesThisBatch;
+                        }
+                    }
+                    inferRequestFetched |= true;
+                }
+                /** Inference was finished for current frame **/
+                if (!inferRequestFetched) {
+                    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+                    continue;
+                }
+            }
+            t1 = Time::now();
+            fsec fs = t1 - t0;
+            ms d = std::chrono::duration_cast<ms>(fs);
+            totalTime += d.count();
+            // resetting state between utterances
+            for (auto&& state : inferRequests.begin()->inferRequest.query_state()) {
+                state.reset();
+            }
+            // -----------------------------------------------------------------------------------------------------

+            // --------------------------- Step 6. Process output
+            // -------------------------------------------------------
+
+            /** Show performance results **/
+            std::cout << "Utterance " << utteranceIndex << ": " << std::endl;
+            std::cout << "Total time in Infer (HW and SW):\t" << totalTime << " ms" << std::endl;
+            std::cout << "Frames in utterance:\t\t\t" << numFrames << " frames" << std::endl;
+            std::cout << "Average Infer time per frame:\t\t" << totalTime / static_cast<double>(numFrames) << " ms\n"
+                      << std::endl;
+
+            if (FLAGS_pc) {
+                // print performance results
+                print_performance_counters(utterancePerfMap,
+                                           frameIndex,
+                                           std::cout,
+                                           getFullDeviceName(core, FLAGS_d),
+                                           totalNumberOfRunsOnHw,
+                                           FLAGS_d);
+            }
+
+            for (size_t next_output = 0; next_output < count_file; next_output++) {
                if (!FLAGS_o.empty()) {
                    auto exOutputScoresFile = fileExt(FLAGS_o);
                    if (exOutputScoresFile == "ark") {
@@ -601,33 +646,21 @@ int main(int argc, char* argv[]) {
                    fileOutput->save_file(output_name_files[next_output].c_str(),
                                          shouldAppend,
                                          uttName,
-                                          &ptrScores.front(),
+                                          &vectorPtrScores[next_output].front(),
                                          numFramesFile,
-                                          numScoresPerFrame);
-                }
-                /** Show performance results **/
-                std::cout << "Total time in Infer (HW and SW):\t" << totalTime << " ms" << std::endl;
-                std::cout << "Frames in utterance:\t\t\t" << numFrames << " frames" << std::endl;
-                std::cout << "Average Infer time per frame:\t\t" << totalTime / static_cast<double>(numFrames) << " ms"
-                          << std::endl;
-                if (FLAGS_pc) {
-                    // print performance results
-                    print_performance_counters(utterancePerfMap,
-                                               frameIndex,
-                                               std::cout,
-                                               getFullDeviceName(core, FLAGS_d),
-                                               totalNumberOfRunsOnHw,
-                                               FLAGS_d);
+                                          numScoresPerOutput[next_output]);
                }
                if (!FLAGS_r.empty()) {
                    // print statistical score error
-                    print_reference_compare_results(totalError, numFrames, std::cout);
+                    std::string outputName =
+                        (outputs.size() == 0) ? executableNet.output(0).get_any_name() : output_names[next_output];
+                    std::cout << "Output name: " << outputName << std::endl;
+                    std::cout << "Number scores per frame: " << numScoresPerOutput[next_output] / batchSize << std::endl
+                              << std::endl;
+                    print_reference_compare_results(vectorTotalError[next_output], numFrames, std::cout);
                }
-                std::cout << "End of Utterance " << utteranceIndex << std::endl << std::endl;
-                // -----------------------------------------------------------------------------------------------------
            }
        }
-        // -----------------------------------------------------------------------------------------------------
    } catch (const std::exception& error) {
        slog::err << error.what() << slog::endl;
        return 1;