From d2333cc01294f9aad582fc742906c3d0903055b5 Mon Sep 17 00:00:00 2001
From: Ilya Churaev
Date: Fri, 17 Sep 2021 03:53:39 +0300
Subject: [PATCH 01/51] Introduced template for OV2.0 migration guide (#7360)

* Introduced template for OV2.0 migration guide
* Fixed comments
* Fixed comments
---
 docs/index.md | 6 ++
 .../docs/common_inference_pipeline.md | 55 +++++++++++++++++++
 docs/migration_ov_2_0/docs/intro.md | 12 ++++
 docs/snippets/ie_common.cpp | 43 +++++++++++++++
 docs/snippets/ov_common.cpp | 34 ++++++++++++
 .../include/openvino/runtime/runtime.hpp | 15 +++++
 6 files changed, 165 insertions(+)
 create mode 100644 docs/migration_ov_2_0/docs/common_inference_pipeline.md
 create mode 100644 docs/migration_ov_2_0/docs/intro.md
 create mode 100644 docs/snippets/ie_common.cpp
 create mode 100644 docs/snippets/ov_common.cpp
 create mode 100644 inference-engine/src/inference_engine/include/openvino/runtime/runtime.hpp

diff --git a/docs/index.md b/docs/index.md
index 9ad04bfe960..7cc4eb90f7b 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -104,3 +104,9 @@ Intel® Distribution of OpenVINO™ toolkit includes the following components:
 - [Intel® Media SDK](https://software.intel.com/en-us/media-sdk) (in Intel® Distribution of OpenVINO™ toolkit for Linux only)
 
 OpenVINO™ Toolkit opensource version is available on [GitHub](https://github.com/openvinotoolkit/openvino). For building the Inference Engine from the source code, see the build instructions.
+
+
+## OpenVINO™ API 2.0
+
+The new OpenVINO™ API 2.0 was introduced to make the OpenVINO™ interface more user-friendly and to align OpenVINO™ with other frameworks.
+The [migration guide](@ref ov_2_0_transition_guide) should simplify the process of migrating applications from the old API to OpenVINO™ API 2.0.
diff --git a/docs/migration_ov_2_0/docs/common_inference_pipeline.md b/docs/migration_ov_2_0/docs/common_inference_pipeline.md
new file mode 100644
index 00000000000..af2dbf25304
--- /dev/null
+++ b/docs/migration_ov_2_0/docs/common_inference_pipeline.md
@@ -0,0 +1,55 @@
+# OpenVINO™ Inference Pipeline {#ov_inference_pipeline}
+
+Usually, to run inference on a network with the OpenVINO™ toolkit, users need to take the following steps:
+ 1. Create Core
+ 2. (Optional) Read model from the disk
+ 2.1. Configure Input and Output of the Model
+ 3. Load the Model to the Device
+ 4. Create an Inference Request
+ 5. Prepare Input
+ 6. Start Inference
+ 7. Process the Inference Results
+
+The code snippets below cover these steps and show how application code should be changed to migrate to OpenVINO™ 2.0.
+
+## 1. Create Core
+
+Inference Engine API:
+
+@snippet snippets/ie_common.cpp ie:create_core
+
+OpenVINO™ 2.0 API:
+
+@snippet snippets/ov_common.cpp ov_api_2_0:create_core
+
+## 2. (Optional) Read model from the disk
+
+Inference Engine API:
+
+@snippet snippets/ie_common.cpp ie:read_model
+
+OpenVINO™ 2.0 API:
+
+@snippet snippets/ov_common.cpp ov_api_2_0:read_model
+
+### 2.1 Configure Input and Output of the Model
+
+Inference Engine API:
+
+@snippet snippets/ie_common.cpp ie:get_inputs_outputs
+
+OpenVINO™ 2.0 API:
+
+@snippet snippets/ov_common.cpp ov_api_2_0:get_inputs_outputs
+
+## 3. Load the Model to the Device
+
+Inference Engine API:
+
+@snippet snippets/ie_common.cpp ie:compile_model
+
+OpenVINO™ 2.0 API:
+
+@snippet snippets/ov_common.cpp ov_api_2_0:compile_model
+
+## 5. TBD
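The guide above only references the snippet files, so the two pipelines are easier to compare when the calls are collected in one place. The condensed sketch below is assembled purely from the `ie_common.cpp` and `ov_common.cpp` snippets added by this commit and is illustrative only; tensor filling and inference for the 2.0 API are omitted because the guide still marks those steps as TBD at this point.

```cpp
// Inference Engine API (condensed from docs/snippets/ie_common.cpp)
InferenceEngine::Core ie;
InferenceEngine::CNNNetwork network = ie.ReadNetwork("model.xml");
InferenceEngine::InputsDataMap inputs = network.getInputsInfo();
InferenceEngine::OutputsDataMap outputs = network.getOutputsInfo();
InferenceEngine::ExecutableNetwork exec_network = ie.LoadNetwork(network, "CPU");
InferenceEngine::InferRequest request = exec_network.CreateInferRequest();
InferenceEngine::Blob::Ptr input_blob = request.GetBlob(inputs.begin()->first);
// ... fill input_blob ...
request.Infer();
InferenceEngine::Blob::Ptr output_blob = request.GetBlob(outputs.begin()->first);
// ... process output_blob ...

// OpenVINO™ 2.0 API (condensed from docs/snippets/ov_common.cpp)
ov::runtime::Core core;
auto model = core.read_model("model.xml");            // snake_case instead of CamelCase
ov::ParameterVector params = model->get_parameters(); // inputs are Parameter operations
ov::ResultVector results = model->get_results();      // outputs are Result operations
ov::runtime::ExecutableNetwork compiled = core.compile_model(model, "CPU");
ov::runtime::InferRequest infer_request = compiled.create_infer_request();
// Filling tensors and running inference with the 2.0 API are not covered yet (marked TBD above).
```

Note how the 2.0 calls already follow the snake_case naming convention described in the transition guide introduced below.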
diff --git a/docs/migration_ov_2_0/docs/intro.md b/docs/migration_ov_2_0/docs/intro.md
new file mode 100644
index 00000000000..5d89b7aff3d
--- /dev/null
+++ b/docs/migration_ov_2_0/docs/intro.md
@@ -0,0 +1,12 @@
+# OpenVINO™ API 2.0 transition guide {#ov_2_0_transition_guide}
+
+The OpenVINO™ API 2.0 was introduced to simplify migration from other frameworks and to make the OpenVINO™ API more user-friendly.
+The list below summarizes the differences between the APIs:
+
+ - OpenVINO™ API 2.0 uses tensor names or indexes to work with inputs and outputs, while the old API works with operation names.
+ - Structures for shapes and element types were changed.
+ - Naming style was changed. The old API uses CamelCase and OpenVINO™ API 2.0 uses snake_case for function names.
+ - Namespaces were aligned between components.
+
+Please refer to the following transition guides to understand how to migrate your own application to OpenVINO™ API 2.0.
+ - [OpenVINO™ Common Inference pipeline](@ref ov_inference_pipeline)
diff --git a/docs/snippets/ie_common.cpp b/docs/snippets/ie_common.cpp
new file mode 100644
index 00000000000..6a558129243
--- /dev/null
+++ b/docs/snippets/ie_common.cpp
@@ -0,0 +1,43 @@
+// Copyright (C) 2018-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include
+
+int main() {
+    //! [ie:create_core]
+    InferenceEngine::Core core;
+    //! [ie:create_core]
+
+    //! [ie:read_model]
+    InferenceEngine::CNNNetwork network = core.ReadNetwork("model.xml");
+    //! [ie:read_model]
+
+    //! [ie:get_inputs_outputs]
+    InferenceEngine::InputsDataMap inputs = network.getInputsInfo();
+    InferenceEngine::OutputsDataMap outputs = network.getOutputsInfo();
+    //! [ie:get_inputs_outputs]
+
+    //! [ie:compile_model]
+    InferenceEngine::ExecutableNetwork exec_network = core.LoadNetwork(network, "CPU");
+    //! [ie:compile_model]
+
+    //! [ie:create_infer_request]
+    InferenceEngine::InferRequest infer_request = exec_network.CreateInferRequest();
+    //! [ie:create_infer_request]
+
+    //! [ie:get_input_tensor]
+    InferenceEngine::Blob::Ptr input_blob = infer_request.GetBlob(inputs.begin()->first);
+    // fill input blob
+    //! [ie:get_input_tensor]
+
+    //! [ie:inference]
+    infer_request.Infer();
+    //! [ie:inference]
+
+    //! [ie:get_output_tensor]
+    InferenceEngine::Blob::Ptr output_blob = infer_request.GetBlob(outputs.begin()->first);
+    // process output data
+    //! [ie:get_output_tensor]
+    return 0;
+}
diff --git a/docs/snippets/ov_common.cpp b/docs/snippets/ov_common.cpp
new file mode 100644
index 00000000000..7cb9e344f7c
--- /dev/null
+++ b/docs/snippets/ov_common.cpp
@@ -0,0 +1,34 @@
+// Copyright (C) 2018-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#include
+#include
+
+int main() {
+    //! [ov_api_2_0:create_core]
+    ov::runtime::Core core;
+    //! [ov_api_2_0:create_core]
+
+    //! [ov_api_2_0:read_model]
+    std::shared_ptr network = core.read_model("model.xml");
+    //! [ov_api_2_0:read_model]
+
+    //! [ov_api_2_0:get_inputs_outputs]
+    ov::ParameterVector inputs = network->get_parameters();
+    ov::ResultVector outputs = network->get_results();
+    //! [ov_api_2_0:get_inputs_outputs]
+
+    //! [ov_api_2_0:compile_model]
+    ov::runtime::ExecutableNetwork exec_network = core.compile_model(network, "CPU");
+    //! 
[ov_api_2_0:compile_model] + + ov::runtime::InferRequest infer_request = exec_network.create_infer_request(); + // + // InferenceEngine::Blob::Ptr input_blob = infer_request.GetBlob(inputs.begin()->first); + // // fill input blob + // infer_request.Infer(); + // + // InferenceEngine::Blob::Ptr output_blob = infer_request.GetBlob(outputs.begin()->first); + // process output data + return 0; +} diff --git a/inference-engine/src/inference_engine/include/openvino/runtime/runtime.hpp b/inference-engine/src/inference_engine/include/openvino/runtime/runtime.hpp new file mode 100644 index 00000000000..57b867b4663 --- /dev/null +++ b/inference-engine/src/inference_engine/include/openvino/runtime/runtime.hpp @@ -0,0 +1,15 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @brief This is a header file for the OpenVINO Runtime Components + * + * @file openvino/runtime/runtime.hpp + */ +#pragma once + +#include "openvino/runtime/core.hpp" +#include "openvino/runtime/executable_network.hpp" +#include "openvino/runtime/infer_request.hpp" +#include "openvino/runtime/profiling_info.hpp" From 44186c31e4e2c942ae6cf0c5f4ff6cad765a316c Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 17 Sep 2021 05:20:28 +0300 Subject: [PATCH 02/51] Fixed path to setupvars.sh ti readme (#7537) --- tests/stress_tests/README.md | 2 +- tests/time_tests/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/stress_tests/README.md b/tests/stress_tests/README.md index b1258878640..78800463372 100644 --- a/tests/stress_tests/README.md +++ b/tests/stress_tests/README.md @@ -36,7 +36,7 @@ To build the tests, you need to have OpenVINO™ installed or build from source. Before build the tests, open a terminal, set OpenVINO™ environment, and after that run the commands below: ``` bash -source /bin/setupvars.sh +source /setupvars.sh mkdir build && cd build cmake .. && make -j$(nproc) ``` diff --git a/tests/time_tests/README.md b/tests/time_tests/README.md index 94e3566b7ee..447186ad706 100644 --- a/tests/time_tests/README.md +++ b/tests/time_tests/README.md @@ -15,7 +15,7 @@ the commands below: 1. Build tests: ``` bash -source /bin/setupvars.sh +source /setupvars.sh mkdir build && cd build cmake .. 
&& make time_tests ``` From 8690e14a5b2f1fdb392cb84ef2109f22d20efd70 Mon Sep 17 00:00:00 2001 From: Anton Pankratv Date: Fri, 17 Sep 2021 10:26:23 +0300 Subject: [PATCH 03/51] Disabled TBB Executor (#7454) --- .../src/mkldnn_plugin/mkldnn_exec_network.cpp | 10 +++++----- .../cpu/shared_tests_instances/skip_tests_config.cpp | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp index 1209cc3f162..14fe27a187d 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp @@ -13,11 +13,11 @@ #include "mkldnn_serialize.h" #include "nodes/mkldnn_memory_node.hpp" #include -#if ((IE_THREAD == IE_THREAD_TBB) || (IE_THREAD == IE_THREAD_TBB_AUTO)) +#define FIX_62820 0 +#if FIX_62820 && ((IE_THREAD == IE_THREAD_TBB) || (IE_THREAD == IE_THREAD_TBB_AUTO)) #include -#else -#include #endif +#include #include #include #include @@ -73,14 +73,14 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::CNNNetwork &network, } else { auto streamsExecutorConfig = InferenceEngine::IStreamsExecutor::Config::MakeDefaultMultiThreaded(_cfg.streamExecutorConfig, isFloatModel); streamsExecutorConfig._name = "CPUStreamsExecutor"; -#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) +#if FIX_62820 && (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) _taskExecutor = std::make_shared(streamsExecutorConfig); #else _taskExecutor = ExecutorManager::getInstance()->getIdleCPUStreamsExecutor(streamsExecutorConfig); #endif } if (0 != cfg.streamExecutorConfig._streams) { -#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) +#if FIX_62820 && (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) // There is no additional threads but we still need serialize callback execution to preserve legacy behaviour _callbackExecutor = std::make_shared(); #else diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp index c03822b7d20..37a21eb2106 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp @@ -95,7 +95,8 @@ std::vector disabledTestPatterns() { R"(smoke_CachingSupportCase_CPU/LoadNetworkCacheTestBase.CompareWithRefImpl/ReadConcatSplitAssign_f32_batch1_CPU)" }; -#if ((IE_THREAD == IE_THREAD_TBB) || (IE_THREAD == IE_THREAD_TBB_AUTO)) +#define FIX_62820 0 +#if FIX_62820 && ((IE_THREAD == IE_THREAD_TBB) || (IE_THREAD == IE_THREAD_TBB_AUTO)) retVector.emplace_back(R"(.*ReusableCPUStreamsExecutor.*)"); #endif From 1f85d4230d453903beaf8ea8ab6aa0cf7bdbf675 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Fri, 17 Sep 2021 11:04:50 +0300 Subject: [PATCH 04/51] Add `use_device_mem` option to benchmark_app (#7433) --- .../samples/benchmark_app/CMakeLists.txt | 37 ++++- .../samples/benchmark_app/benchmark_app.hpp | 18 +++ .../benchmark_app/infer_request_wrap.hpp | 4 + .../samples/benchmark_app/main.cpp | 12 +- .../benchmark_app/remote_blobs_filling.cpp | 140 ++++++++++++++++++ .../benchmark_app/remote_blobs_filling.hpp | 64 ++++++++ thirdparty/ocl/CMakeLists.txt | 41 ++++- 7 files changed, 313 insertions(+), 3 deletions(-) create mode 100644 
inference-engine/samples/benchmark_app/remote_blobs_filling.cpp create mode 100644 inference-engine/samples/benchmark_app/remote_blobs_filling.hpp diff --git a/inference-engine/samples/benchmark_app/CMakeLists.txt b/inference-engine/samples/benchmark_app/CMakeLists.txt index b37495e5e43..e3412774f27 100644 --- a/inference-engine/samples/benchmark_app/CMakeLists.txt +++ b/inference-engine/samples/benchmark_app/CMakeLists.txt @@ -2,11 +2,46 @@ # SPDX-License-Identifier: Apache-2.0 # +set(TARGET_NAME "benchmark_app") + file (GLOB SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) file (GLOB HDR ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) -ie_add_sample(NAME benchmark_app +ie_add_sample(NAME ${TARGET_NAME} SOURCES ${SRC} HEADERS ${HDR} DEPENDENCIES format_reader ie_samples_utils OPENCV_DEPENDENCIES core) + +find_package(OpenCL) + +find_path(OpenCL_HPP_INCLUDE_DIR + NAMES + CL/cl2.hpp OpenCL/cl2.hpp + HINTS + ${opencl_root_hints} + ENV "PROGRAMFILES(X86)" + ENV AMDAPPSDKROOT + ENV INTELOCLSDKROOT + ENV NVSDKCOMPUTE_ROOT + ENV CUDA_PATH + ENV ATISTREAMSDKROOT + ENV OCL_ROOT + PATH_SUFFIXES + include + OpenCL/common/inc + "AMD APP/include") + +if(OPENCL_HEADERS_DIR) + # Use OpenCL CPP headers from sources if present + set(OpenCL_HEADERS OPENCL_HEADERS_DIR) +elseif(OpenCL_HPP_INCLUDE_DIR) + # Append OpenCL CPP headers to C headers and use both + set(OpenCL_HEADERS OpenCL_INCLUDE_DIR OpenCL_HPP_INCLUDE_DIR) +endif() + +if(OpenCL_FOUND AND OpenCL_HEADERS) + target_link_libraries(${TARGET_NAME} PRIVATE OpenCL::OpenCL) + target_include_directories(${TARGET_NAME} PRIVATE ${OpenCL_HEADERS}) + target_compile_definitions(${TARGET_NAME} PRIVATE HAVE_GPU_DEVICE_MEM_SUPPORT) +endif() diff --git a/inference-engine/samples/benchmark_app/benchmark_app.hpp b/inference-engine/samples/benchmark_app/benchmark_app.hpp index c062cee1960..6395db4ff29 100644 --- a/inference-engine/samples/benchmark_app/benchmark_app.hpp +++ b/inference-engine/samples/benchmark_app/benchmark_app.hpp @@ -4,6 +4,10 @@ #pragma once +#if defined(HAVE_GPU_DEVICE_MEM_SUPPORT) +# define HAVE_DEVICE_MEM_SUPPORT +#endif + #include #include @@ -132,6 +136,12 @@ static const char progress_message[] = // @brief message for performance counters option static const char pc_message[] = "Optional. Report performance counters."; +#ifdef HAVE_DEVICE_MEM_SUPPORT +// @brief message for switching memory allocation type option +static const char use_device_mem_message[] = + "Optional. Switch between host and device memory allocation for input and output buffers."; +#endif + #ifdef USE_OPENCV // @brief message for load config option static const char load_config_message[] = @@ -266,6 +276,11 @@ DEFINE_bool(progress, false, progress_message); /// @brief Define flag for showing performance counters
DEFINE_bool(pc, false, pc_message); +#ifdef HAVE_DEVICE_MEM_SUPPORT +/// @brief Define flag for switching beetwen host and device memory allocation for input and output buffers +DEFINE_bool(use_device_mem, false, use_device_mem_message); +#endif + #ifdef USE_OPENCV /// @brief Define flag for loading configuration file
DEFINE_string(load_config, "", load_config_message); @@ -339,6 +354,9 @@ static void showUsage() { std::cout << " -nthreads \"\" " << infer_num_threads_message << std::endl; std::cout << " -enforcebf16= " << enforce_bf16_message << std::endl; std::cout << " -pin \"YES\"/\"HYBRID_AWARE\"/\"NO\"/\"NUMA\" " << infer_threads_pinning_message << std::endl; +#ifdef HAVE_DEVICE_MEM_SUPPORT + std::cout << " -use_device_mem " << use_device_mem_message << std::endl; +#endif std::cout << std::endl << " Statistics dumping options:" << std::endl; std::cout << " -report_type \"\" " << report_type_message << std::endl; std::cout << " -report_folder " << report_folder_message << std::endl; diff --git a/inference-engine/samples/benchmark_app/infer_request_wrap.hpp b/inference-engine/samples/benchmark_app/infer_request_wrap.hpp index 5e15f597e7e..dd10a28eb7a 100644 --- a/inference-engine/samples/benchmark_app/infer_request_wrap.hpp +++ b/inference-engine/samples/benchmark_app/infer_request_wrap.hpp @@ -65,6 +65,10 @@ public: return _request.GetBlob(name); } + void setBlob(const std::string& name, const InferenceEngine::Blob::Ptr& data) { + _request.SetBlob(name, data); + } + double getExecutionTimeInMilliseconds() const { auto execTime = std::chrono::duration_cast(_endTime - _startTime); return static_cast(execTime.count()) * 0.000001; diff --git a/inference-engine/samples/benchmark_app/main.cpp b/inference-engine/samples/benchmark_app/main.cpp index 6c643d45486..9120ce7136e 100644 --- a/inference-engine/samples/benchmark_app/main.cpp +++ b/inference-engine/samples/benchmark_app/main.cpp @@ -21,6 +21,7 @@ #include "infer_request_wrap.hpp" #include "inputs_filling.hpp" #include "progress_bar.hpp" +#include "remote_blobs_filling.hpp" #include "statistics_report.hpp" #include "utils.hpp" @@ -592,7 +593,16 @@ int main(int argc, char* argv[]) { next_step(); InferRequestsQueue inferRequestsQueue(exeNetwork, nireq); - fillBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests); + if (isFlagSetInCommandLine("use_device_mem")) { + if (device_name.find("GPU") == 0) + ::gpu::fillRemoteBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests, exeNetwork); + else if (device_name.find("CPU") == 0) + fillBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests); + else + IE_THROW() << "Requested device doesn't support `use_device_mem` option."; + } else { + fillBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests); + } // ----------------- 10. 
Measuring performance // ------------------------------------------------------------------ diff --git a/inference-engine/samples/benchmark_app/remote_blobs_filling.cpp b/inference-engine/samples/benchmark_app/remote_blobs_filling.cpp new file mode 100644 index 00000000000..dc6d9fbf34a --- /dev/null +++ b/inference-engine/samples/benchmark_app/remote_blobs_filling.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "remote_blobs_filling.hpp" + +#include +#include +#include +#include + +namespace gpu { + +template +using uniformDistribution = typename std::conditional< + std::is_floating_point::value, + std::uniform_real_distribution, + typename std::conditional::value, std::uniform_int_distribution, void>::type>::type; + +template +void fillBufferRandom(void* inputBuffer, + size_t elementsNum, + T rand_min = std::numeric_limits::min(), + T rand_max = std::numeric_limits::max()) { + std::mt19937 gen(0); + uniformDistribution distribution(rand_min, rand_max); + auto inputBufferData = static_cast(inputBuffer); + for (size_t i = 0; i < elementsNum; i++) { + inputBufferData[i] = static_cast(distribution(gen)); + } +} + +void fillBuffer(void* inputBuffer, size_t elementsNum, InferenceEngine::Precision precision) { + if (precision == InferenceEngine::Precision::FP32) { + fillBufferRandom(inputBuffer, elementsNum); + } else if (precision == InferenceEngine::Precision::FP16) { + fillBufferRandom(inputBuffer, elementsNum); + } else if (precision == InferenceEngine::Precision::I32) { + fillBufferRandom(inputBuffer, elementsNum); + } else if (precision == InferenceEngine::Precision::I64) { + fillBufferRandom(inputBuffer, elementsNum); + } else if (precision == InferenceEngine::Precision::U8) { + // uniform_int_distribution is not allowed in the C++17 + // standard and vs2017/19 + fillBufferRandom(inputBuffer, elementsNum); + } else if (precision == InferenceEngine::Precision::I8) { + // uniform_int_distribution is not allowed in the C++17 standard + // and vs2017/19 + fillBufferRandom(inputBuffer, elementsNum); + } else if (precision == InferenceEngine::Precision::U16) { + fillBufferRandom(inputBuffer, elementsNum); + } else if (precision == InferenceEngine::Precision::I16) { + fillBufferRandom(inputBuffer, elementsNum); + } else if (precision == InferenceEngine::Precision::BOOL) { + fillBufferRandom(inputBuffer, elementsNum, 0, 1); + } else { + IE_THROW() << "Requested precision is not supported"; + } +} + +size_t getBytesPerElement(InferenceEngine::Precision precision) { + switch (precision) { + case InferenceEngine::Precision::FP32: + return 4; + case InferenceEngine::Precision::FP16: + return 2; + case InferenceEngine::Precision::I32: + return 4; + case InferenceEngine::Precision::I64: + return 8; + case InferenceEngine::Precision::U8: + return 1; + case InferenceEngine::Precision::I8: + return 1; + case InferenceEngine::Precision::U16: + return 2; + case InferenceEngine::Precision::I16: + return 2; + case InferenceEngine::Precision::BOOL: + return 1; + default: + IE_THROW() << "Requested precision is not supported"; + } +} + +void fillRemoteBlobs(const std::vector& inputFiles, + const size_t& batchSize, + benchmark_app::InputsInfo& app_inputs_info, + std::vector requests, + const InferenceEngine::ExecutableNetwork& exeNetwork) { +#ifdef HAVE_DEVICE_MEM_SUPPORT + slog::info << "Device memory will be used for input and output blobs" << slog::endl; + if (inputFiles.size()) { + slog::warn << "Device memory supports only random data at 
this moment, input images will be ignored" + << slog::endl; + } + auto context = exeNetwork.GetContext(); + auto oclContext = std::dynamic_pointer_cast(context)->get(); + auto oclInstance = std::make_shared(oclContext); + + auto setShared = [&](size_t requestId, + const std::string name, + const InferenceEngine::TensorDesc& desc, + bool fillRandom = false) { + cl_int err; + auto inputDims = desc.getDims(); + auto elementsNum = std::accumulate(begin(inputDims), end(inputDims), 1, std::multiplies()); + auto inputSize = elementsNum * getBytesPerElement(desc.getPrecision()); + + cl::Buffer sharedBuffer = + cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err); + + if (fillRandom) { + void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(sharedBuffer, + CL_TRUE, + CL_MEM_READ_WRITE, + 0, + (cl::size_type)inputSize); + fillBuffer(mappedPtr, elementsNum, desc.getPrecision()); + oclInstance->_queue.enqueueUnmapMemObject(sharedBuffer, mappedPtr); + } + + InferenceEngine::Blob::Ptr sharedBlob = InferenceEngine::gpu::make_shared_blob(desc, context, sharedBuffer); + + requests.at(requestId)->setBlob(name, sharedBlob); + }; + + for (size_t requestId = 0; requestId < requests.size(); requestId++) { + for (auto& item : exeNetwork.GetInputsInfo()) + setShared(requestId, item.first, item.second->getTensorDesc(), true); + + for (auto& item : exeNetwork.GetOutputsInfo()) + setShared(requestId, item.first, item.second->getTensorDesc()); + } +#else + IE_THROW() << "Device memory requested for GPU device, but OpenCL was not linked"; +#endif +} + +} // namespace gpu diff --git a/inference-engine/samples/benchmark_app/remote_blobs_filling.hpp b/inference-engine/samples/benchmark_app/remote_blobs_filling.hpp new file mode 100644 index 00000000000..66e2b1b2c66 --- /dev/null +++ b/inference-engine/samples/benchmark_app/remote_blobs_filling.hpp @@ -0,0 +1,64 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#if defined(HAVE_GPU_DEVICE_MEM_SUPPORT) +# define HAVE_DEVICE_MEM_SUPPORT +# include +#endif + +#include + +#include "infer_request_wrap.hpp" +#include "utils.hpp" + +namespace gpu { + +#ifdef HAVE_DEVICE_MEM_SUPPORT +struct OpenCL { + cl::Context _context; + cl::Device _device; + cl::CommandQueue _queue; + + explicit OpenCL(std::shared_ptr> media_api_context_properties = nullptr) { + // get Intel GPU OCL device, create context and queue + { + std::vector devices; + std::vector platforms; + const unsigned int refVendorID = 0x8086; + + cl::Platform::get(&platforms); + for (auto& p : platforms) { + p.getDevices(CL_DEVICE_TYPE_GPU, &devices); + for (auto& d : devices) { + if (refVendorID == d.getInfo()) { + _device = d; + _context = cl::Context(_device); + break; + } + } + } + + cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; + _queue = cl::CommandQueue(_context, _device, props); + } + } + + explicit OpenCL(cl_context context) { + // user-supplied context handle + _context = cl::Context(context, true); + _device = cl::Device(_context.getInfo()[0].get(), true); + + cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; + _queue = cl::CommandQueue(_context, _device, props); + } +}; +#endif + +void fillRemoteBlobs(const std::vector& inputFiles, + const size_t& batchSize, + benchmark_app::InputsInfo& app_inputs_info, + std::vector requests, + const InferenceEngine::ExecutableNetwork& exeNetwork); + +} // namespace gpu diff --git a/thirdparty/ocl/CMakeLists.txt b/thirdparty/ocl/CMakeLists.txt 
index d091cc3f54d..64cb6859499 100644 --- a/thirdparty/ocl/CMakeLists.txt +++ b/thirdparty/ocl/CMakeLists.txt @@ -2,6 +2,40 @@ # SPDX-License-Identifier: Apache-2.0 # +if(NOT ENABLE_CLDNN) + return() +endif() + +function(get_lib_name TARGET_NAME LIBRARY_NAME) + if (CMAKE_BUILD_TYPE STREQUAL "Debug") + set(BUILD_SUFFIX ${IE_DEBUG_POSTFIX}) + else() + set(BUILD_SUFFIX ${IE_RELEASE_POSTFIX}) + endif() + + if(WIN32) + set(LIB_SUFFIX "${BUILD_SUFFIX}${CMAKE_LINK_LIBRARY_SUFFIX}") + else() + set(LIB_SUFFIX "${BUILD_SUFFIX}${CMAKE_SHARED_LIBRARY_SUFFIX}") + endif() + + set("${LIBRARY_NAME}" "${CMAKE_SHARED_MODULE_PREFIX}${TARGET_NAME}${LIB_SUFFIX}" PARENT_SCOPE) +endfunction() + +function(get_lib_path OUTPUT_DIR FINAL_OUTPUT_DIR) + if(WIN32) + set(LIB_DIR "") + else() + set(LIB_DIR "lib") + endif() + + if (NOT CMAKE_GENERATOR MATCHES "Ninja" AND NOT UNIX) + set(OUTPUT_DIR "${OUTPUT_DIR}/${CMAKE_BUILD_TYPE}") + endif() + + set("${FINAL_OUTPUT_DIR}" "${OUTPUT_DIR}/${LIB_DIR}" PARENT_SCOPE) +endfunction() + set(OPENCL_ICD_LOADER_HEADERS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cl_headers/" CACHE PATH "Path to OCL includes" FORCE) set(OPENCL_HEADERS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cl_headers/" "${CMAKE_CURRENT_SOURCE_DIR}/clhpp_headers/include/" CACHE PATH "Path to OCL (CL and CLHPP) includes" FORCE) @@ -20,5 +54,10 @@ target_include_directories(OpenCL SYSTEM PUBLIC ${OPENCL_HEADERS_DIR}) # The following varables are needed to make find_package(OpenCL) work set(OpenCL_VERSION_STRING "2.2" CACHE STRING "" FORCE) -set(OpenCL_INCLUDE_DIR "${OPENCL_ICD_LOADER_HEADERS_DIR}" CACHE PATH "" FORCE) +set(OpenCL_INCLUDE_DIR "${OPENCL_HEADERS_DIR}" CACHE PATH "" FORCE) set(OPENCLROOT "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" CACHE PATH "" FORCE) + +get_lib_path("${OUTPUT_ROOT}/${BIN_FOLDER}" OPENCL_LIB_DIR) +get_lib_name("OpenCL" OPENCL_LIB_NAME) + +set(OpenCL_LIBRARY "${OPENCL_LIB_DIR}/${OPENCL_LIB_NAME}" CACHE PATH "" FORCE) From ac8db25864c6c3c41a48411298fb84fc8c5b1b9f Mon Sep 17 00:00:00 2001 From: Yuan Hu Date: Fri, 17 Sep 2021 16:17:54 +0800 Subject: [PATCH 05/51] Enable CPU accelerate FIL in MULTI (#7380) * Enable CPU accelerate FIL in MULTI Signed-off-by: Hu, Yuan2 * add configure to device KEY_PERFORMANCE_HINT_NUM_REQUESTS Signed-off-by: Hu, Yuan2 --- .../multi_device_exec_network.cpp | 343 ++++++++++++++---- .../multi_device_exec_network.hpp | 39 +- .../src/multi_device/multi_device_plugin.cpp | 153 ++++++-- .../src/multi_device/multi_device_plugin.hpp | 8 +- .../behavior/infer_request/perf_counters.cpp | 11 + .../behavior/infer_request/config.cpp | 13 - .../behavior/infer_request/perf_counters.cpp | 11 + .../behavior/test_plugin.cpp | 8 +- .../behavior/config.cpp | 9 +- .../behavior/infer_request/callback.cpp | 13 +- .../behavior/infer_request/perf_counters.cpp | 12 + .../behavior/infer_request/wait.cpp | 10 +- .../behavior/preprocessing/set_preprocess.cpp | 10 +- .../behavior/test_plugin.cpp | 12 +- .../behavior/version.cpp | 8 +- 15 files changed, 543 insertions(+), 117 deletions(-) diff --git a/inference-engine/src/multi_device/multi_device_exec_network.cpp b/inference-engine/src/multi_device/multi_device_exec_network.cpp index b569a0cf40d..3a2a3673e14 100644 --- a/inference-engine/src/multi_device/multi_device_exec_network.cpp +++ b/inference-engine/src/multi_device/multi_device_exec_network.cpp @@ -11,16 +11,46 @@ #include #include - +#include "ie_icore.hpp" #include "ie_metric_helpers.hpp" #include #include "multi_device_exec_network.hpp" #include "multi_device_async_infer_request.hpp" #include 
"multi_device_plugin.hpp" +#include "ngraph/opsets/opset1.hpp" +#include "ngraph_ops/convolution_ie.hpp" +#include "ngraph_ops/deconvolution_ie.hpp" +#include "transformations/utils/utils.hpp" + // ------------------------------MultiDeviceExecutableNetwork---------------------------- namespace MultiDevicePlugin { - using namespace InferenceEngine; +using namespace InferenceEngine; + +namespace { +std::string GetNetworkPrecision(const InferenceEngine::CNNNetwork &network) { + auto nGraphFunc = network.getFunction(); + bool isINTModel = ngraph::op::util::has_op_with_type(nGraphFunc); + if (isINTModel) { + return METRIC_VALUE(INT8); + } + for (auto & node : nGraphFunc->get_ordered_ops()) { + if (std::dynamic_pointer_cast(node) || + std::dynamic_pointer_cast(node) || + std::dynamic_pointer_cast(node) || + std::dynamic_pointer_cast(node) || + std::dynamic_pointer_cast(node) || + std::dynamic_pointer_cast(node)) { + auto layerType = node->input(1).get_element_type().get_type_name(); + if (layerType == "f32") + return METRIC_VALUE(FP32); + if (layerType == "f16") + return METRIC_VALUE(FP16); + } + } + return METRIC_VALUE(FP32); +} +} // namespace thread_local MultiDeviceExecutableNetwork::WorkerInferRequest* MultiDeviceExecutableNetwork::_thisWorkerInferRequest = nullptr; // TODO: revert to the plain variable (see header file), when we moved to the next CentOS 8.x in our support matrix @@ -60,74 +90,215 @@ MultiDeviceExecutableNetwork::MultiDeviceExecutableNetwork(const DeviceMapGetMetric(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)).as(); - } catch (const InferenceEngine::Exception &iie) { - IE_THROW() - << "Every device used with the Multi-Device should " - << "support OPTIMAL_NUMBER_OF_INFER_REQUESTS ExecutableNetwork metric. " - << "Failed to query the metric for the " << device << " with error:" << iie.what(); - } - const auto numRequests = (_devicePriorities.end() == itNumRequests || - itNumRequests->numRequestsPerDevices == -1) ? 
optimalNum : itNumRequests->numRequestsPerDevices; - auto& workerRequests = _workerRequests[device]; - auto& idleWorkerRequests = _idleWorkerRequests[device]; - workerRequests.resize(numRequests); - _inferPipelineTasksDeviceSpecific[device] = std::unique_ptr>(new ThreadSafeQueue); - auto* idleWorkerRequestsPtr = &(idleWorkerRequests); - idleWorkerRequests.set_capacity(numRequests); - for (auto&& workerRequest : workerRequests) { - workerRequest._inferRequest = { network, network->CreateInferRequest() }; - auto* workerRequestPtr = &workerRequest; - IE_ASSERT(idleWorkerRequests.try_push(workerRequestPtr) == true); - workerRequest._inferRequest->SetCallback( - [workerRequestPtr, this, device, idleWorkerRequestsPtr] (std::exception_ptr exceptionPtr) mutable { - IdleGuard idleGuard{workerRequestPtr, *idleWorkerRequestsPtr}; - workerRequestPtr->_exceptionPtr = exceptionPtr; - { - auto capturedTask = std::move(workerRequestPtr->_task); - capturedTask(); - } - // try to return the request to the idle list (fails if the overall object destruction has began) - if (idleGuard.Release()->try_push(workerRequestPtr)) { - // let's try to pop a task, as we know there is at least one idle request, schedule if succeeded - // if no device-agnostic tasks, let's try pop the device specific task, schedule if succeeded - Task t; - if (_inferPipelineTasks.try_pop(t)) - ScheduleToWorkerInferRequest(std::move(t)); - else if (_inferPipelineTasksDeviceSpecific[device]->try_pop(t)) - ScheduleToWorkerInferRequest(std::move(t), device); - } - }); - } + GenerateWorkers(device, network); } } +void MultiDeviceExecutableNetwork::GenerateWorkers(const std::string& device, const SoExecutableNetworkInternal& executableNetwork) { + auto itNumRequests = std::find_if(_devicePriorities.cbegin(), _devicePriorities.cend(), + [&device](const DeviceInformation& d){ return d.deviceName == device;}); + unsigned int optimalNum = 0; + try { + optimalNum = executableNetwork->GetMetric(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)).as(); + } catch (const InferenceEngine::Exception &iie) { + IE_THROW() + << "Every device used with the Multi-Device should " + << "support OPTIMAL_NUMBER_OF_INFER_REQUESTS ExecutableNetwork metric. " + << "Failed to query the metric for the " << device << " with error:" << iie.what(); + } + const auto numRequests = (_devicePriorities.end() == itNumRequests || + itNumRequests->numRequestsPerDevices == -1) ? 
optimalNum : itNumRequests->numRequestsPerDevices; + auto& workerRequests = _workerRequests[device]; + auto& idleWorkerRequests = _idleWorkerRequests[device]; + workerRequests.resize(numRequests); + _inferPipelineTasksDeviceSpecific[device] = std::unique_ptr>(new ThreadSafeQueue); + auto* idleWorkerRequestsPtr = &(idleWorkerRequests); + idleWorkerRequests.set_capacity(numRequests); + for (auto&& workerRequest : workerRequests) { + workerRequest._inferRequest = { executableNetwork, executableNetwork->CreateInferRequest() }; + auto* workerRequestPtr = &workerRequest; + IE_ASSERT(idleWorkerRequests.try_push(workerRequestPtr) == true); + workerRequest._inferRequest->SetCallback( + [workerRequestPtr, this, device, idleWorkerRequestsPtr] (std::exception_ptr exceptionPtr) mutable { + IdleGuard idleGuard{workerRequestPtr, *idleWorkerRequestsPtr}; + workerRequestPtr->_exceptionPtr = exceptionPtr; + { + auto capturedTask = std::move(workerRequestPtr->_task); + capturedTask(); + } + // try to return the request to the idle list (fails if the overall object destruction has began) + if (idleGuard.Release()->try_push(workerRequestPtr)) { + // let's try to pop a task, as we know there is at least one idle request, schedule if succeeded + // if no device-agnostic tasks, let's try pop the device specific task, schedule if succeeded + Task t; + if (_inferPipelineTasks.try_pop(t)) + ScheduleToWorkerInferRequest(std::move(t)); + else if (_inferPipelineTasksDeviceSpecific[device]->try_pop(t)) + ScheduleToWorkerInferRequest(std::move(t), device); + } + }); + } +} + +MultiDeviceExecutableNetwork::MultiDeviceExecutableNetwork(const std::string& modelPath, + const InferenceEngine::CNNNetwork& network, + const std::vector& metaDevices, + const std::string& strDevices, + MultiDeviceInferencePlugin* plugin, + const bool needPerfCounters) + : _devicePriorities{metaDevices} + , _devicePrioritiesInitial{metaDevices} + , _needPerfCounters(needPerfCounters) + , _multiPlugin(plugin) + , _workModeIsAUTO(true) { + if (_multiPlugin->GetCore() == nullptr) { + IE_THROW() << "Please, work with MULTI device via InferencEngine::Core object"; + } + + if (modelPath.empty() && network.getFunction() == nullptr) { + IE_THROW() << "MULTI device supports just ngraph network representation"; + } + + _core = _multiPlugin->GetCore(); // shared_ptr that holds the Core + _config[MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES] = strDevices; + + std::vector needLoadDevices; + + // check if have cpu device + const auto CPUIter = std::find_if(metaDevices.begin(), metaDevices.end(), + [=](const DeviceInformation& d)->bool{return d.deviceName.find("CPU") != std::string::npos;}); + if (CPUIter != metaDevices.end()) { + _cpuDevice = *CPUIter; + _config.insert(_cpuDevice.config.begin(), _cpuDevice.config.end()); + needLoadDevices.push_back(_cpuDevice); + _cpuFuture = _cpuPromise.get_future(); + } + + // get accelerator device, like GPU + auto networkPrecision = GetNetworkPrecision(network); + _acceleratorDevice = _multiPlugin->SelectDevice(metaDevices, networkPrecision); + bool isAccelerator = + _acceleratorDevice.deviceName.find("CPU") == std::string::npos; + if (isAccelerator) { + _config.insert(_acceleratorDevice.config.begin(), _acceleratorDevice.config.end()); + needLoadDevices.push_back(_acceleratorDevice); + _acceleratorFuture = _acceleratorPromise.get_future(); + } + + if (needLoadDevices.size() == 0) { + IE_THROW() << "No device set"; + } + + // will not wait for loading accelerator network, + // so the executor can't be destroyed 
before finished the task, + // so use executor as a member of MultiDeviceExecutableNetwork. + _executor = InferenceEngine::ExecutorManager::getInstance()->getIdleCPUStreamsExecutor( + IStreamsExecutor::Config{"AutoDeviceAsyncLoad", + static_cast(std::thread::hardware_concurrency()) /* max possible #streams*/, + 1 /*single thread per stream*/, + IStreamsExecutor::ThreadBindingType::NONE}); + + for (auto& p : needLoadDevices) { + // initialize these containers firstly to avoid insert operation in threads + _idleWorkerRequests[p.deviceName]; + _workerRequests[p.deviceName]; + _inferPipelineTasksDeviceSpecific[p.deviceName] = NULL; + const auto device = p.deviceName; + const auto deviceConfig = p.config; + // will not wait for loading accelerator network, + // so some parameters need to be transferred by value. + _executor->run([&, modelPath, network, device, deviceConfig]() { + SoExecutableNetworkInternal executableNetwork; + if (!modelPath.empty()) { + executableNetwork = _core->LoadNetwork(modelPath, device, deviceConfig); + } else { + executableNetwork = _core->LoadNetwork(network, device, deviceConfig); + } + + GenerateWorkers(device, executableNetwork); + + if (device.find("CPU") == std::string::npos) { + _alreadyActualNetwork = true; + _acceleratorPromise.set_value(executableNetwork); + } else { + _cpuPromise.set_value(executableNetwork); + } + }); + } + + WaitFirstNetworkReady(); +} + +void MultiDeviceExecutableNetwork::WaitFirstNetworkReady() { + if (_alreadyActualNetwork) { + return; + } + if (_cpuFuture.valid() && _acceleratorFuture.valid()) { + try { + _networkFirstReady = _cpuFuture.get(); + } catch (const std::exception& e) { + printf("Warning: load network to CPU failed: %s\n", e.what()); + _networkActualNeeded = _acceleratorFuture.get(); + } + } else if (_acceleratorFuture.valid()) { // only accelerator is valid, like AUTO:GPU + _networkActualNeeded = _acceleratorFuture.get(); + } else if (_cpuFuture.valid()) { // only CPU is valid, like AUTO:CPU + _networkActualNeeded = _cpuFuture.get(); + } else { + IE_THROW() << "No device task available"; + } + + // if there is only one device or loading CPU device is failed, + // the ActualNetwork is already ok now. 
+ if (!_acceleratorFuture.valid()) { + _alreadyActualNetwork = true; + } +} + +void MultiDeviceExecutableNetwork::WaitActualNetworkReady() const { + // Maybe different API will call this function, so add call once here + // for every MultiDeviceExecutableNetwork instance + std::call_once(_oc, [&] () { + if (_acceleratorFuture.valid()) { + _networkActualNeeded = _acceleratorFuture.get(); + } + }); +} + void MultiDeviceExecutableNetwork::ScheduleToWorkerInferRequest(Task inferPipelineTask, DeviceName preferred_device) { - auto devices = [&] { - std::lock_guard lock(_mutex); - return _devicePriorities; - }(); + std::vector devices; + // AUTO work mode + if (_workModeIsAUTO) { + if (!preferred_device.empty()) { + // the preferred_device should be the selected device in AUTO work mode + if (preferred_device != _acceleratorDevice.deviceName) { + IE_THROW(NotFound) << "The preferred_device should be the selected device"; + } + // if the device needed by customer is not ready, need to wait for it + WaitActualNetworkReady(); + devices.push_back(_acceleratorDevice); + } else { + // _acceleratorDevice could be the same as _cpuDevice, such as AUTO:CPU + if (_alreadyActualNetwork) { + devices.push_back(_acceleratorDevice); + } else { + devices.push_back(_cpuDevice); + } + } + } else { + devices = [&] { + std::lock_guard lock(_mutex); + return _devicePriorities; + }(); + } for (auto&& device : devices) { if (!preferred_device.empty() && (device.deviceName != preferred_device)) continue; - WorkerInferRequest* workerRequestPtr = nullptr; - NotBusyWorkerRequests& idleWorkerRequests = _idleWorkerRequests[device.deviceName]; - if (idleWorkerRequests.try_pop(workerRequestPtr)) { - IdleGuard idleGuard{workerRequestPtr, idleWorkerRequests}; - _thisWorkerInferRequest = workerRequestPtr; - { - auto capturedTask = std::move(inferPipelineTask); - capturedTask(); - } - idleGuard.Release(); + if (RunPipelineTask(inferPipelineTask, _idleWorkerRequests[device.deviceName], preferred_device)) { return; } } + // no vacant requests this time, storing the task to the respective queue if (!preferred_device.empty()) _inferPipelineTasksDeviceSpecific[preferred_device]->push(std::move(inferPipelineTask)); @@ -135,11 +306,35 @@ void MultiDeviceExecutableNetwork::ScheduleToWorkerInferRequest(Task inferPipeli _inferPipelineTasks.push(std::move(inferPipelineTask)); } +bool MultiDeviceExecutableNetwork::RunPipelineTask(Task& inferPipelineTask, + NotBusyWorkerRequests& idleWorkerRequests, + const DeviceName& preferred_device) { + WorkerInferRequest *workerRequestPtr = nullptr; + if (idleWorkerRequests.try_pop(workerRequestPtr)) { + IdleGuard idleGuard{workerRequestPtr, idleWorkerRequests}; + _thisWorkerInferRequest = workerRequestPtr; + { + auto capturedTask = std::move(inferPipelineTask); + capturedTask(); + } + idleGuard.Release(); + return true; + } + return false; +} + void MultiDeviceExecutableNetwork::run(Task inferPipelineTask) { ScheduleToWorkerInferRequest(std::move(inferPipelineTask), _thisPreferredDeviceName); } MultiDeviceExecutableNetwork::~MultiDeviceExecutableNetwork() { + // this is necessary to guarantee member destroyed after getting future + if (_workModeIsAUTO) { + WaitActualNetworkReady(); + // it's necessary to wait the loading network threads to stop here. 
+ InferenceEngine::ExecutorManager::getInstance()->clear("AutoDeviceAsyncLoad"); + _executor.reset(); + } { std::lock_guard lock(_mutex); _devicePriorities.clear(); @@ -147,14 +342,19 @@ MultiDeviceExecutableNetwork::~MultiDeviceExecutableNetwork() { /* NOTE: The only threads that use `MultiDeviceExecutableNetwork` worker infer requests' threads. * But AsyncInferRequest destructor should wait for all asynchronous tasks by the request */ - for (auto&& networkValue : _networksPerDevice) { + for (auto&& idleWorker : _idleWorkerRequests) { // stop accepting any idle requests back (for re-scheduling) - _idleWorkerRequests.at(networkValue.first).set_capacity(0); + idleWorker.second.set_capacity(0); } _workerRequests.clear(); } std::shared_ptr MultiDeviceExecutableNetwork::GetContext() const { + if (_workModeIsAUTO) { + WaitActualNetworkReady(); + return _networkActualNeeded->GetContext(); + } + auto devices = [&] { std::lock_guard lock(_mutex); return _devicePriorities; @@ -177,6 +377,11 @@ InferenceEngine::IInferRequestInternal::Ptr MultiDeviceExecutableNetwork::Create auto num = _numRequestsCreated++; size_t sum = 0; InferenceEngine::SoIInferRequestInternal request_to_share_blobs_with; + + if (_workModeIsAUTO) { + return std::make_shared(networkInputs, networkOutputs, request_to_share_blobs_with); + } + // borrowing device-specific blobs from the underlying requests for the device-agnostic, user-facing requests // this allows to potentially save on the data-copy later (if the requests are scheduled in the same order) for (const auto& device : _devicePrioritiesInitial) { @@ -200,6 +405,10 @@ IInferRequestInternal::Ptr MultiDeviceExecutableNetwork::CreateInferRequest() { } void MultiDeviceExecutableNetwork::SetConfig(const std::map &config) { + if (_workModeIsAUTO) { + IE_THROW(NotImplemented); + } + auto priorities = config.find(MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES); if (priorities == config.end() || config.size() > 1) { IE_THROW() << "The only config supported for the Network's SetConfig is MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES"; @@ -252,6 +461,15 @@ InferenceEngine::Parameter MultiDeviceExecutableNetwork::GetConfig(const std::st } InferenceEngine::Parameter MultiDeviceExecutableNetwork::GetMetric(const std::string &name) const { + if (_workModeIsAUTO) { + // fixme: should we wait actual device? meanwhile it will block inference, how to fix? 
+ if (_alreadyActualNetwork) { + WaitActualNetworkReady(); + return _networkActualNeeded->GetMetric(name); + } + return _networkFirstReady->GetMetric(name); + } + if (name == METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)) { unsigned int res = 0u; for (auto n : _networksPerDevice) { @@ -284,5 +502,4 @@ InferenceEngine::Parameter MultiDeviceExecutableNetwork::GetMetric(const std::st IE_THROW() << "Unsupported Network metric: " << name; } } - } // namespace MultiDevicePlugin diff --git a/inference-engine/src/multi_device/multi_device_exec_network.hpp b/inference-engine/src/multi_device/multi_device_exec_network.hpp index 2fb6e9462a7..2fd86c63170 100644 --- a/inference-engine/src/multi_device/multi_device_exec_network.hpp +++ b/inference-engine/src/multi_device/multi_device_exec_network.hpp @@ -16,14 +16,21 @@ #include #include #include +#include +#include "ie_icore.hpp" #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) # include #endif + namespace MultiDevicePlugin { +class MultiDeviceInferencePlugin; + using DeviceName = std::string; +using NetworkFuture = std::future; +using NetworkPromise = std::promise; struct DeviceInformation { DeviceName deviceName; @@ -105,10 +112,16 @@ public: }; using NotBusyWorkerRequests = ThreadSafeBoundedQueue; - explicit MultiDeviceExecutableNetwork(const DeviceMap& networksPerDevice, + explicit MultiDeviceExecutableNetwork(const DeviceMap& networksPerDevice, const std::vector& networkDevices, const std::unordered_map& config, const bool needPerfCounters = false); + MultiDeviceExecutableNetwork(const std::string& modelPath, + const InferenceEngine::CNNNetwork& network, + const std::vector& metaDevices, + const std::string& strDevices, + MultiDeviceInferencePlugin* plugin, + const bool needPerfCounters = false); void SetConfig(const std::map &config) override; InferenceEngine::Parameter GetConfig(const std::string &name) const override; @@ -138,6 +151,30 @@ public: std::unordered_map _config; bool _needPerfCounters = false; std::atomic_size_t _numRequestsCreated = {0}; + +private: + void GenerateWorkers(const std::string& device, const InferenceEngine::SoExecutableNetworkInternal& executableNetwork); + void WaitActualNetworkReady() const; + void WaitFirstNetworkReady(); + static bool RunPipelineTask(InferenceEngine::Task& inferPipelineTask, + NotBusyWorkerRequests& idleWorkerRequests, + const DeviceName& preferred_device); + +private: + std::shared_ptr _core; + InferenceEngine::IStreamsExecutor::Ptr _executor; + MultiDeviceInferencePlugin* _multiPlugin; + InferenceEngine::SoExecutableNetworkInternal _networkFirstReady; + mutable InferenceEngine::SoExecutableNetworkInternal _networkActualNeeded; + NetworkFuture _cpuFuture; + NetworkPromise _cpuPromise; + mutable NetworkFuture _acceleratorFuture; + mutable NetworkPromise _acceleratorPromise; + mutable bool _alreadyActualNetwork = {false}; + bool _workModeIsAUTO = {false}; + DeviceInformation _cpuDevice; + DeviceInformation _acceleratorDevice; + mutable std::once_flag _oc; }; } // namespace MultiDevicePlugin diff --git a/inference-engine/src/multi_device/multi_device_plugin.cpp b/inference-engine/src/multi_device/multi_device_plugin.cpp index f63bcde8c4d..b4f6e3aa49e 100644 --- a/inference-engine/src/multi_device/multi_device_plugin.cpp +++ b/inference-engine/src/multi_device/multi_device_plugin.cpp @@ -219,34 +219,50 @@ IExecutableNetworkInternal::Ptr MultiDeviceInferencePlugin::LoadNetworkImpl(cons bool workModeAuto = workMode != fullConfig.end() && workMode->second == 
InferenceEngine::PluginConfigParams::YES; auto priorities = fullConfig.find(MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES); - // not found device priorities for -d AUTO use case - if (priorities == fullConfig.end()) { - if (workModeAuto) { - std::string allDevices; - auto availableDevices = GetCore()->GetAvailableDevices(); - if (availableDevices.empty()) { - IE_THROW(NotFound) << "No available device found"; - } - for (auto&& device : availableDevices) { - allDevices += device; - allDevices += ((device == availableDevices[availableDevices.size()-1]) ? "" : ","); - } - metaDevices = ParseMetaDevices(allDevices, fullConfig); - multiNetworkConfig.insert({MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, allDevices}); - } else { - IE_THROW() << "KEY_MULTI_DEVICE_PRIORITIES key is not set for " << GetName() << " device"; + // if workMode is AUTO + if (workModeAuto) { + // check the configure and check if need to set PerfCounters configure to device + // and set filter configure + bool needPerfCounters = false; + std::map filterConfig; + CheckConfig(fullConfig, needPerfCounters, filterConfig); + // filter the device that supports filter configure + auto strDevices = GetDeviceList(fullConfig); + auto metaDevices = ParseMetaDevices(strDevices, fullConfig); + auto supportDevices = FilterDevice(metaDevices, filterConfig); + if (supportDevices.size() == 0) { + IE_THROW() << "there is no device support the configure"; } + // replace the configure with configure that auto want to pass to device + // and reset the strDevices to support devices + std::vector validConfigKey; + validConfigKey.push_back(PluginConfigParams::KEY_PERF_COUNT); + validConfigKey.push_back(PluginConfigParams::KEY_EXCLUSIVE_ASYNC_REQUESTS); + validConfigKey.push_back(PluginConfigParams::KEY_PERFORMANCE_HINT); + validConfigKey.push_back(PluginConfigParams::KEY_PERFORMANCE_HINT_NUM_REQUESTS); + strDevices = ""; + for (auto iter = supportDevices.begin(); iter != supportDevices.end(); iter++) { + std::map deviceConfig; + auto& configs = iter->config; + for (auto& config : configs) { + if (std::find(validConfigKey.begin(), validConfigKey.end(), config.first) != validConfigKey.end()) { + deviceConfig.insert({config.first, config.second}); + } + } + iter->config = deviceConfig; + strDevices = iter->deviceName; + strDevices += ((iter + 1) == supportDevices.end()) ? 
"" : ","; + } + + return std::make_shared(modelPath, network, supportDevices, strDevices, this, needPerfCounters); + } + + if (priorities == fullConfig.end()) { + IE_THROW() << "KEY_MULTI_DEVICE_PRIORITIES key is not set for " << GetName() << " device"; } else { // for use case -d MULTI:xPU or -d AUTO:xPU metaDevices = ParseMetaDevices(priorities->second, fullConfig); multiNetworkConfig.insert(*priorities); } - // check if it is -d AUTO or -d AUTO:xPU use case - if (workModeAuto) { - // select the device - auto device = SelectDevice(metaDevices, networkPrecision).deviceName; - // parse the config for the device - metaDevices = ParseMetaDevices(SelectDevice(metaDevices, networkPrecision).deviceName, fullConfig); - } DeviceMap executableNetworkPerDevice; std::mutex load_mutex; @@ -345,7 +361,6 @@ QueryNetworkResult MultiDeviceInferencePlugin::QueryNetwork(const CNNNetwork& return queryResult; } - DeviceInformation MultiDeviceInferencePlugin::SelectDevice(const std::vector& metaDevices, const std::string& networkPrecision) { if (metaDevices.empty()) { IE_THROW(NotFound) << "No available device to select in " << GetName() << " plugin"; @@ -466,4 +481,94 @@ DeviceInformation MultiDeviceInferencePlugin::SelectDevice(const std::vector& config) const { + std::string allDevices; + + auto deviceListConfig = config.find(MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES); + if (deviceListConfig == config.end()) { + auto deviceList = GetCore()->GetAvailableDevices(); + for (auto&& device : deviceList) { + allDevices += device; + allDevices += ((device == deviceList[deviceList.size()-1]) ? "" : ","); + } + } else { + allDevices = deviceListConfig->second; + } + + if (allDevices.empty()) { + IE_THROW() << "Please, check environment due to no supported devices can be used"; + } + + return allDevices; +} + +void MultiDeviceInferencePlugin::CheckConfig(const std::map& config, + bool& needPerfCounters, std::map& filterConfig) { + // TODO need to optimize this code, too much duplicated code + const auto perf_hints_configs = PerfHintsConfig::SupportedKeys(); + for (auto&& kvp : config) { + if (kvp.first.find("AUTO_") == 0) { + continue; + } else if (kvp.first == PluginConfigParams::KEY_PERF_COUNT) { + if (kvp.second == PluginConfigParams::YES) { + needPerfCounters = true; + filterConfig.insert({kvp.first, kvp.second}); + } else if (kvp.second == PluginConfigParams::NO) { + needPerfCounters = false; + } else { + IE_THROW() << "Unsupported config value: " << kvp.second + << " for key: " << kvp.first; + } + } else if (kvp.first == PluginConfigParams::KEY_EXCLUSIVE_ASYNC_REQUESTS) { + if (kvp.second == PluginConfigParams::YES || + kvp.second == PluginConfigParams::NO) { + continue; + } else { + IE_THROW() << "Unsupported config value: " << kvp.second + << " for key: " << kvp.first; + } + } else if (std::find(perf_hints_configs.begin(), perf_hints_configs.end(), kvp.first) != perf_hints_configs.end()) { + PerfHintsConfig::CheckConfigAndValue(kvp); + } else if (supported_configKeys.end() == std::find(supported_configKeys.begin(), supported_configKeys.end(), kvp.first)) { + IE_THROW() << "Unsupported config key: " << kvp.first; + } + } +} + +std::vector MultiDeviceInferencePlugin::FilterDevice(const std::vector& metaDevices, + const std::map& config) { + if (metaDevices.empty()) { + IE_THROW(NotFound) << "No available device to filter " << GetName() << " plugin"; + } + + if (config.size() == 0) { + return metaDevices; + } + + std::vector filterDevice; + for (auto&& item : metaDevices) { + bool support = true; + 
std::vector supportedMetrics = GetCore()->GetMetric(item.deviceName, METRIC_KEY(SUPPORTED_METRICS)); + if (std::find(supportedMetrics.begin(), supportedMetrics.end(), METRIC_KEY(SUPPORTED_CONFIG_KEYS)) != supportedMetrics.end()) { + std::vector supportKeys = GetCore()->GetMetric(item.deviceName, METRIC_KEY(SUPPORTED_CONFIG_KEYS)); + for (auto&& kvp : config) { + auto targetKey = std::find(supportKeys.begin(), supportKeys.end(), kvp.first); + // if device have the key, we think the device support it + if (targetKey != supportKeys.end()) { + continue; + } else { + support = false; + break; + } + } + } else { + support = false; + } + + if (support) { + filterDevice.push_back(item); + } + } + return filterDevice; +} } // namespace MultiDevicePlugin diff --git a/inference-engine/src/multi_device/multi_device_plugin.hpp b/inference-engine/src/multi_device/multi_device_plugin.hpp index 4021c5ec9e1..f6f0ed39809 100644 --- a/inference-engine/src/multi_device/multi_device_plugin.hpp +++ b/inference-engine/src/multi_device/multi_device_plugin.hpp @@ -36,6 +36,9 @@ public: std::vector ParseMetaDevices(const std::string & devicesRequestsCfg, const std::map & config) const; + std::string GetDeviceList(const std::map& config) const; + DeviceInformation SelectDevice(const std::vector& metaDevices, const std::string& networkPrecision = METRIC_VALUE(FP32)); + protected: std::map GetSupportedConfig(const std::map& config, const MultiDevicePlugin::DeviceName & deviceName) const; @@ -45,7 +48,10 @@ private: InferenceEngine::CNNNetwork network, const std::map& config, const std::string &networkPrecision = METRIC_VALUE(FP32)); - DeviceInformation SelectDevice(const std::vector& metaDevices, const std::string& networkPrecision = METRIC_VALUE(FP32)); + static void CheckConfig(const std::map& config, bool& needPerfCounters, + std::map& filterConfig); + std::vector FilterDevice(const std::vector& metaDevices, + const std::map& config); }; } // namespace MultiDevicePlugin diff --git a/inference-engine/tests/functional/plugin/conformance/test_runner/src/behavior/infer_request/perf_counters.cpp b/inference-engine/tests/functional/plugin/conformance/test_runner/src/behavior/infer_request/perf_counters.cpp index b7007dd16f3..a7bfe68707c 100644 --- a/inference-engine/tests/functional/plugin/conformance/test_runner/src/behavior/infer_request/perf_counters.cpp +++ b/inference-engine/tests/functional/plugin/conformance/test_runner/src/behavior/infer_request/perf_counters.cpp @@ -18,6 +18,10 @@ const std::vector> MulticonfigsPerfCounters = {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES), targetDevice }} }; +const std::vector> AutoconfigsPerfCounters = { + {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES), targetDevice }} +}; + INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, InferRequestPerfCountersTest, ::testing::Combine( ::testing::Values(targetDevice), @@ -30,4 +34,11 @@ INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests, InferRequestPerfCountersTest ::testing::ValuesIn(MulticonfigsPerfCounters)), InferRequestPerfCountersTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests, InferRequestPerfCountersTest, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_AUTO), + ::testing::ValuesIn(AutoconfigsPerfCounters)), + InferRequestPerfCountersTest::getTestCaseName); + + } // namespace diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/behavior/infer_request/config.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/behavior/infer_request/config.cpp index 
7013c3096dd..c289a5831c0 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/behavior/infer_request/config.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/behavior/infer_request/config.cpp @@ -62,18 +62,5 @@ namespace { ::testing::ValuesIn(MultiInConfigs)), InferRequestConfigTest::getTestCaseName); - INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests, InferRequestConfigTest, - ::testing::Combine( - ::testing::Values(1u), - ::testing::Values(CommonTestUtils::DEVICE_AUTO), - ::testing::ValuesIn(multiConfigs)), - InferRequestConfigTest::getTestCaseName); - - INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests_, InferRequestConfigTest, - ::testing::Combine( - ::testing::Values(1u), - ::testing::Values(CommonTestUtils::DEVICE_AUTO), - ::testing::ValuesIn(MultiInConfigs)), - InferRequestConfigTest::getTestCaseName); } // namespace diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/behavior/infer_request/perf_counters.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/behavior/infer_request/perf_counters.cpp index 684f1938b37..b75a7e8c789 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/behavior/infer_request/perf_counters.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/behavior/infer_request/perf_counters.cpp @@ -37,6 +37,10 @@ const std::vector> Multiconfigs = { {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_CPU}} }; +const std::vector> Autoconfigs = { + {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_CPU}} +}; + INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, InferRequestPerfCountersTest, ::testing::Combine( ::testing::Values(CommonTestUtils::DEVICE_CPU), @@ -48,4 +52,11 @@ INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests, InferRequestPerfCountersTest ::testing::Values(CommonTestUtils::DEVICE_MULTI), ::testing::ValuesIn(Multiconfigs)), InferRequestPerfCountersTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests, InferRequestPerfCountersTest, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_AUTO), + ::testing::ValuesIn(Autoconfigs)), + InferRequestPerfCountersTest::getTestCaseName); + } // namespace diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/behavior/test_plugin.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/behavior/test_plugin.cpp index c03c1a4f121..f7656b81c76 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/behavior/test_plugin.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/behavior/test_plugin.cpp @@ -32,6 +32,10 @@ namespace { {InferenceEngine::PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS, InferenceEngine::PluginConfigParams::CPU_THROUGHPUT_AUTO}} }; + const std::vector> AutoConfigsInputOutput = { + {{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES , CommonTestUtils::DEVICE_CPU}} + }; + const std::vector> configsOutput = { {}, {{InferenceEngine::PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS, InferenceEngine::PluginConfigParams::CPU_THROUGHPUT_AUTO}} @@ -56,7 +60,7 @@ namespace { ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_AUTO), - ::testing::ValuesIn(MultiConfigsInputOutput)), + ::testing::ValuesIn(AutoConfigsInputOutput)), BehaviorTestOutput::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, BehaviorTests, @@ -98,7 +102,7 @@ 
namespace { ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_AUTO), - ::testing::ValuesIn(MultiConfigsInputOutput)), + ::testing::ValuesIn(AutoConfigsInputOutput)), BehaviorTestInput::getTestCaseName); } // namespace diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/config.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/config.cpp index b972a0e4f7c..c75aa903a21 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/config.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/config.cpp @@ -111,6 +111,13 @@ namespace { {InferenceEngine::PluginConfigParams::KEY_PERFORMANCE_HINT_NUM_REQUESTS, "1"}} }; + const std::vector> autoConfigs = { + {{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES , CommonTestUtils::DEVICE_GPU}, + {InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, + CommonTestUtils::DEVICE_GPU + std::string(",") + CommonTestUtils::DEVICE_CPU}} + }; + + INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, CorrectConfigAPITests, ::testing::Combine( ::testing::ValuesIn(netPrecisions), @@ -136,7 +143,7 @@ namespace { ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_AUTO), - ::testing::ValuesIn(multiconf)), + ::testing::ValuesIn(autoConfigs)), CorrectConfigAPITests::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, IncorrectConfigAPITests, diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request/callback.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request/callback.cpp index 90a22c2435c..dfaa591dd96 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request/callback.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request/callback.cpp @@ -14,6 +14,11 @@ const std::vector> multiConfigs = { {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_GPU}} }; +const std::vector> autoConfigs = { + {{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES , CommonTestUtils::DEVICE_GPU}, + {InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES , CommonTestUtils::DEVICE_GPU + std::string(",") + CommonTestUtils::DEVICE_CPU}} +}; + INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, InferRequestCallbackTests, ::testing::Combine( ::testing::Values(CommonTestUtils::DEVICE_GPU), @@ -27,8 +32,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests, InferRequestCallbackTests, InferRequestCallbackTests::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests, InferRequestCallbackTests, - ::testing::Combine( - ::testing::Values(CommonTestUtils::DEVICE_AUTO), - ::testing::ValuesIn(multiConfigs)), - InferRequestCallbackTests::getTestCaseName); + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_AUTO), + ::testing::ValuesIn(autoConfigs)), + InferRequestCallbackTests::getTestCaseName); } // namespace diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request/perf_counters.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request/perf_counters.cpp index 5a4a5852c5a..bdb9cf90518 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request/perf_counters.cpp +++ 
b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request/perf_counters.cpp @@ -14,6 +14,12 @@ namespace { {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_GPU}} }; + const std::vector> AutoConfigs = { + {{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES , CommonTestUtils::DEVICE_GPU}, + {InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, + CommonTestUtils::DEVICE_GPU + std::string(",") + CommonTestUtils::DEVICE_CPU}} + }; + INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, InferRequestPerfCountersTest, ::testing::Combine( ::testing::Values(CommonTestUtils::DEVICE_GPU), @@ -26,4 +32,10 @@ namespace { ::testing::ValuesIn(Multiconfigs)), InferRequestPerfCountersTest::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests, InferRequestPerfCountersTest, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_AUTO), + ::testing::ValuesIn(AutoConfigs)), + InferRequestPerfCountersTest::getTestCaseName); + } // namespace diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request/wait.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request/wait.cpp index 07fe3ddd855..41da3069a87 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request/wait.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request/wait.cpp @@ -13,6 +13,12 @@ namespace { {{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, CommonTestUtils::DEVICE_GPU}} }; + const std::vector> autoConfigs = { + {{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES , CommonTestUtils::DEVICE_GPU}, + {InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES , + CommonTestUtils::DEVICE_GPU + std::string(",") + CommonTestUtils::DEVICE_CPU}} + }; + INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, InferRequestWaitTests, ::testing::Combine( ::testing::Values(CommonTestUtils::DEVICE_GPU), @@ -28,7 +34,7 @@ namespace { INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests, InferRequestWaitTests, ::testing::Combine( ::testing::Values(CommonTestUtils::DEVICE_AUTO), - ::testing::ValuesIn(configs)), + ::testing::ValuesIn(autoConfigs)), InferRequestWaitTests::getTestCaseName); -} // namespace \ No newline at end of file +} // namespace diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/preprocessing/set_preprocess.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/preprocessing/set_preprocess.cpp index c6a17bb4bf7..950425675bc 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/preprocessing/set_preprocess.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/preprocessing/set_preprocess.cpp @@ -22,6 +22,12 @@ namespace { {{ InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES , CommonTestUtils::DEVICE_GPU}} }; + const std::vector> autoConfigs = { + {{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES , CommonTestUtils::DEVICE_GPU}, + {InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES , + CommonTestUtils::DEVICE_GPU + std::string(",") + CommonTestUtils::DEVICE_CPU}} + }; + INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, InferRequestPreprocessTest, ::testing::Combine( ::testing::ValuesIn(netPrecisions), @@ -40,7 +46,7 @@ namespace { 
::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_AUTO), - ::testing::ValuesIn(multiConfigs)), + ::testing::ValuesIn(autoConfigs)), InferRequestPreprocessTest::getTestCaseName); const std::vector ioPrecisions = { @@ -85,4 +91,4 @@ namespace { ::testing::ValuesIn(configs)), InferRequestPreprocessDynamicallyInSetBlobTest::getTestCaseName); -} // namespace \ No newline at end of file +} // namespace diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/test_plugin.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/test_plugin.cpp index 98069d07303..bfe1d09c36b 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/test_plugin.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/test_plugin.cpp @@ -31,6 +31,12 @@ namespace { {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_GPU}} }; + const std::vector> autoConfigs = { + {{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES , CommonTestUtils::DEVICE_GPU}, + {InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES , + CommonTestUtils::DEVICE_GPU + std::string(",") + CommonTestUtils::DEVICE_CPU}} + }; + const std::vector> configsInput = { {}, {{InferenceEngine::PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, InferenceEngine::PluginConfigParams::GPU_THROUGHPUT_AUTO}} @@ -65,7 +71,7 @@ namespace { ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_AUTO), - ::testing::ValuesIn(MultiConfigsInputOutput)), + ::testing::ValuesIn(autoConfigs)), BehaviorTestOutput::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, BehaviorTests, @@ -86,7 +92,7 @@ namespace { ::testing::Combine( ::testing::Values(InferenceEngine::Precision::FP32), ::testing::Values(CommonTestUtils::DEVICE_AUTO), - ::testing::ValuesIn(MultiConfigs)), + ::testing::ValuesIn(autoConfigs)), BehaviorTests::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, BehaviorTestInput, @@ -107,7 +113,7 @@ namespace { ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_AUTO), - ::testing::ValuesIn(MultiConfigsInputOutput)), + ::testing::ValuesIn(autoConfigs)), BehaviorTestInput::getTestCaseName); } // namespace diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/version.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/version.cpp index c02a209e9d5..fe7bbfa5c09 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/version.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/version.cpp @@ -14,6 +14,12 @@ namespace { {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_GPU}} }; + const std::vector> autoConfigs = { + {{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES , CommonTestUtils::DEVICE_GPU}, + {InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES , + CommonTestUtils::DEVICE_GPU + std::string(",") + CommonTestUtils::DEVICE_CPU}} + }; + const std::vector> Heteroconfigs = { {{ HETERO_CONFIG_KEY(DUMP_GRAPH_DOT) , CommonTestUtils::DEVICE_GPU}} }; @@ -36,7 +42,7 @@ namespace { ::testing::Combine( ::testing::Values(InferenceEngine::Precision::FP32), ::testing::Values(CommonTestUtils::DEVICE_AUTO), - ::testing::ValuesIn(Multiconfigs)), + ::testing::ValuesIn(autoConfigs)), 
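+                            // Unlike Multiconfigs, autoConfigs also carry a "GPU,CPU" priority string,
+                            // so the AUTO selection path is exercised with more than one candidate device.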
VersionTest::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_Hetero_BehaviorTests, VersionTest, From a6bdb8744f30fea9c413d43cf6fb6129e5e537d2 Mon Sep 17 00:00:00 2001 From: Dmitry Pigasin Date: Fri, 17 Sep 2021 11:59:49 +0300 Subject: [PATCH 06/51] [IE Python Speech Sample] Enable --scale_factor for multiple input files (#7482) * Enable --scale_factor for multiple input files * Small refactoring of getting a first utterance * Fix mypy issue * Update readme Co-authored-by: Dmitry Pigasin --- .../python/sample/speech_sample/README.md | 30 ++++++----- .../python/sample/speech_sample/arg_parser.py | 5 +- .../sample/speech_sample/speech_sample.py | 51 +++++++++++++++---- 3 files changed, 62 insertions(+), 24 deletions(-) diff --git a/inference-engine/ie_bridges/python/sample/speech_sample/README.md b/inference-engine/ie_bridges/python/sample/speech_sample/README.md index 54403416bc4..2809cc72a5f 100644 --- a/inference-engine/ie_bridges/python/sample/speech_sample/README.md +++ b/inference-engine/ie_bridges/python/sample/speech_sample/README.md @@ -89,15 +89,15 @@ optional arguments: Path to an .xml file with a trained model (required if -rg is missing). -rg IMPORT_GNA_MODEL, --import_gna_model IMPORT_GNA_MODEL - Read GNA model from file using path/filename provided + Read GNA model from file using path/filename provided (required if -m is missing). Options: -h, --help Show this help message and exit. -i INPUT, --input INPUT - Required. Path to an input file (.ark or .npz). + Required. Path to an input file (.ark or .npz). -o OUTPUT, --output OUTPUT - Optional. Output file name to save inference results + Optional. Output file name to save inference results (.ark or .npz). -r REFERENCE, --reference REFERENCE Optional. Read reference score file and compare @@ -117,7 +117,8 @@ Options: (default 16). -sf SCALE_FACTOR, --scale_factor SCALE_FACTOR Optional. The user-specified input scale factor for - quantization. + quantization. If the network contains multiple inputs, + provide scale factors by separating them with commas. -wg EXPORT_GNA_MODEL, --export_gna_model EXPORT_GNA_MODEL Optional. Write GNA model to file using path/filename provided. @@ -176,27 +177,30 @@ The sample application logs each step in a standard output stream. [ INFO ] Creating Inference Engine [ INFO ] Reading the network: wsj_dnn5b.xml [ INFO ] Configuring input and output blobs -[ INFO ] Using scale factor of 2175.4322417 calculated from first utterance. +[ INFO ] Using scale factor(s) calculated from first utterance +[ INFO ] For input 0 using scale factor of 2175.4322418 [ INFO ] Loading the model to the plugin [ INFO ] Starting inference in synchronous mode [ INFO ] Utterance 0 (4k0c0301) +[ INFO ] Output blob name: affinetransform14/Fused_Add_ [ INFO ] Frames in utterance: 1294 -[ INFO ] Total time in Infer (HW and SW): 5305.47ms -[ INFO ] max error: 0.7051839 -[ INFO ] avg error: 0.0448387 -[ INFO ] avg rms error: 0.0582387 -[ INFO ] stdev error: 0.0371649 +[ INFO ] Total time in Infer (HW and SW): 6211.45ms +[ INFO ] max error: 0.7051840 +[ INFO ] avg error: 0.0448388 +[ INFO ] avg rms error: 0.0582387 +[ INFO ] stdev error: 0.0371650 [ INFO ] [ INFO ] Utterance 1 (4k0c0302) +[ INFO ] Output blob name: affinetransform14/Fused_Add_ [ INFO ] Frames in utterance: 1005 -[ INFO ] Total time in Infer (HW and SW): 5031.53ms +[ INFO ] Total time in Infer (HW and SW): 4742.27ms [ INFO ] max error: 0.7575974 [ INFO ] avg error: 0.0452166 [ INFO ] avg rms error: 0.0586013 [ INFO ] stdev error: 0.0372769 -[ INFO ] ... 
-[ INFO ] Total sample time: 38033.09ms +[ INFO ] Total sample time: 40219.99ms +[ INFO ] File result.npz was created! [ INFO ] This sample is an API example, for any performance measurements please use the dedicated benchmark_app tool ``` diff --git a/inference-engine/ie_bridges/python/sample/speech_sample/arg_parser.py b/inference-engine/ie_bridges/python/sample/speech_sample/arg_parser.py index 1d2ad5c7d71..d4e2b345ea2 100644 --- a/inference-engine/ie_bridges/python/sample/speech_sample/arg_parser.py +++ b/inference-engine/ie_bridges/python/sample/speech_sample/arg_parser.py @@ -28,8 +28,9 @@ def parse_args() -> argparse.Namespace: args.add_argument('-bs', '--batch_size', default=1, type=int, help='Optional. Batch size 1-8 (default 1).') args.add_argument('-qb', '--quantization_bits', default=16, type=int, help='Optional. Weight bits for quantization: 8 or 16 (default 16).') - args.add_argument('-sf', '--scale_factor', type=float, - help='Optional. The user-specified input scale factor for quantization.') + args.add_argument('-sf', '--scale_factor', type=str, + help='Optional. The user-specified input scale factor for quantization. ' + 'If the network contains multiple inputs, provide scale factors by separating them with commas.') args.add_argument('-wg', '--export_gna_model', type=str, help='Optional. Write GNA model to file using path/filename provided.') args.add_argument('-we', '--export_embedded_gna_model', type=str, help=argparse.SUPPRESS) diff --git a/inference-engine/ie_bridges/python/sample/speech_sample/speech_sample.py b/inference-engine/ie_bridges/python/sample/speech_sample/speech_sample.py index 67601e1f379..14d2e4fa441 100755 --- a/inference-engine/ie_bridges/python/sample/speech_sample/speech_sample.py +++ b/inference-engine/ie_bridges/python/sample/speech_sample/speech_sample.py @@ -103,6 +103,32 @@ def get_output_layer_list(net: Union[IENetwork, ExecutableNetwork], return [list(net.outputs.keys())[-1]] +def parse_scale_factors(args: argparse.Namespace) -> list: + """Get a list of scale factors for input files""" + input_files = re.split(', |,', args.input) + scale_factors = re.split(', |,', str(args.scale_factor)) + scale_factors = list(map(float, scale_factors)) + + if len(input_files) != len(scale_factors): + log.error(f'Incorrect command line for multiple inputs: {len(scale_factors)} scale factors provided for ' + f'{len(input_files)} input files.') + sys.exit(-7) + + for i, scale_factor in enumerate(scale_factors): + if float(scale_factor) < 0: + log.error(f'Scale factor for input #{i} (counting from zero) is out of range (must be positive).') + sys.exit(-8) + + return scale_factors + + +def set_scale_factors(plugin_config: dict, scale_factors: list): + """Set a scale factor provided for each input""" + for i, scale_factor in enumerate(scale_factors): + log.info(f'For input {i} using scale factor of {scale_factor:.7f}') + plugin_config[f'GNA_SCALE_FACTOR_{i}'] = str(scale_factor) + + def main(): log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout) args = parse_args() @@ -149,16 +175,23 @@ def main(): # Set a GNA scale factor if args.import_gna_model: - log.info(f'Using scale factor from the imported GNA model: {args.import_gna_model}') - elif args.scale_factor: - log.info(f'Using scale factor of {args.scale_factor:.7f} specified by user.') - plugin_config['GNA_SCALE_FACTOR'] = str(args.scale_factor) + if args.scale_factor: + log.warning(f'Custom scale factor will be used for imported GNA model: {args.import_gna_model}') + 
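+            # The user-supplied factors (e.g. '-sf 2175.43,2048.0' when two input files are given)
+            # take precedence over the scale factors stored in the imported GNA model.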
set_scale_factors(plugin_config, parse_scale_factors(args)) + else: + log.info(f'Using scale factor from the imported GNA model: {args.import_gna_model}') else: - utterances = read_utterance_file(args.input.split(',')[0]) - key = sorted(utterances)[0] - scale_factor = get_scale_factor(utterances[key]) - log.info(f'Using scale factor of {scale_factor:.7f} calculated from first utterance.') - plugin_config['GNA_SCALE_FACTOR'] = str(scale_factor) + if args.scale_factor: + set_scale_factors(plugin_config, parse_scale_factors(args)) + else: + scale_factors = [] + + for file_name in re.split(', |,', args.input): + first_utterance = next(iter(read_utterance_file(file_name).values())) + scale_factors.append(get_scale_factor(first_utterance)) + + log.info('Using scale factor(s) calculated from first utterance') + set_scale_factors(plugin_config, scale_factors) if args.export_embedded_gna_model: plugin_config['GNA_FIRMWARE_MODEL_IMAGE'] = args.export_embedded_gna_model From 660c106233657ce5b7ec97f505e41c121fe52ce8 Mon Sep 17 00:00:00 2001 From: Roman Lyamin Date: Fri, 17 Sep 2021 13:12:25 +0300 Subject: [PATCH 07/51] [GPU] Performance counters fix (#7143) --- .../src/cldnn_engine/cldnn_graph.cpp | 72 ++++--- .../src/cldnn_engine/cldnn_graph.h | 2 - .../src/cldnn_engine/cldnn_program.cpp | 3 - .../src/cldnn_engine/cldnn_program.h | 2 - .../src/cldnn_engine/ops/batch_to_space.cpp | 3 +- .../src/cldnn_engine/ops/broadcast.cpp | 15 +- .../src/cldnn_engine/ops/concat.cpp | 3 +- .../src/cldnn_engine/ops/constant.cpp | 2 +- .../src/cldnn_engine/ops/convert.cpp | 17 +- .../src/cldnn_engine/ops/convolution.cpp | 58 +++--- .../cldnn_engine/ops/ctc_greedy_decoder.cpp | 19 +- .../src/cldnn_engine/ops/cum_sum.cpp | 3 +- .../src/cldnn_engine/ops/custom.cpp | 13 +- .../src/cldnn_engine/ops/depth_to_space.cpp | 3 +- .../src/cldnn_engine/ops/detection_output.cpp | 3 +- .../src/cldnn_engine/ops/eltwise.cpp | 13 +- .../src/cldnn_engine/ops/embedding_bag.cpp | 25 ++- .../ops/extract_image_patches.cpp | 3 +- .../src/cldnn_engine/ops/fake_quantize.cpp | 3 +- .../src/cldnn_engine/ops/gather tree.cpp | 8 +- .../src/cldnn_engine/ops/gather.cpp | 8 +- .../src/cldnn_engine/ops/gather_elements.cpp | 3 +- .../src/cldnn_engine/ops/gather_nd.cpp | 9 +- inference-engine/src/cldnn_engine/ops/grn.cpp | 3 +- .../src/cldnn_engine/ops/interpolate.cpp | 3 +- .../src/cldnn_engine/ops/loop.cpp | 12 +- inference-engine/src/cldnn_engine/ops/lrn.cpp | 3 +- .../src/cldnn_engine/ops/matmul.cpp | 29 ++- inference-engine/src/cldnn_engine/ops/mvn.cpp | 3 +- .../cldnn_engine/ops/non_max_suppression.cpp | 31 ++- .../src/cldnn_engine/ops/normalize_l2.cpp | 5 +- .../src/cldnn_engine/ops/one_hot.cpp | 3 +- inference-engine/src/cldnn_engine/ops/pad.cpp | 3 +- .../src/cldnn_engine/ops/parameter.cpp | 43 +++-- .../src/cldnn_engine/ops/pooling.cpp | 6 +- .../src/cldnn_engine/ops/prior_box.cpp | 6 +- .../src/cldnn_engine/ops/proposal.cpp | 17 +- .../src/cldnn_engine/ops/reduce.cpp | 13 +- .../src/cldnn_engine/ops/region_yolo.cpp | 3 +- .../src/cldnn_engine/ops/reorg_yolo.cpp | 3 +- .../src/cldnn_engine/ops/reshape.cpp | 11 +- .../src/cldnn_engine/ops/result.cpp | 9 +- .../src/cldnn_engine/ops/reverse_sequence.cpp | 3 +- inference-engine/src/cldnn_engine/ops/rnn.cpp | 101 ++++++---- .../src/cldnn_engine/ops/roi_pooling.cpp | 12 +- .../ops/scatter_elements_update.cpp | 9 +- .../cldnn_engine/ops/scatter_nd_update.cpp | 9 +- .../src/cldnn_engine/ops/scatter_update.cpp | 3 +- .../src/cldnn_engine/ops/select.cpp | 11 +- 
.../src/cldnn_engine/ops/shuffle_channels.cpp | 3 +- .../src/cldnn_engine/ops/softmax.cpp | 8 +- .../src/cldnn_engine/ops/space_to_batch.cpp | 3 +- .../src/cldnn_engine/ops/space_to_depth.cpp | 3 +- .../src/cldnn_engine/ops/split.cpp | 3 +- .../src/cldnn_engine/ops/strided_slice.cpp | 9 +- .../src/cldnn_engine/ops/tensor_iterator.cpp | 20 +- .../src/cldnn_engine/ops/tile.cpp | 3 +- .../src/cldnn_engine/ops/topk.cpp | 13 +- .../src/cldnn_engine/ops/transpose.cpp | 3 +- .../src/cldnn_engine/ops/unary.cpp | 8 +- .../nms_transformation_for_last_node.cpp | 13 ++ .../remove_parameter.cpp | 16 ++ .../clDNN/api/cldnn/graph/network.hpp | 1 + .../clDNN/api/cldnn/primitives/activation.hpp | 6 +- .../api/cldnn/primitives/arg_max_min.hpp | 3 +- .../cldnn/primitives/average_unpooling.hpp | 3 +- .../api/cldnn/primitives/batch_to_space.hpp | 3 +- .../cldnn/primitives/binary_convolution.hpp | 3 +- .../clDNN/api/cldnn/primitives/border.hpp | 5 +- .../clDNN/api/cldnn/primitives/broadcast.hpp | 3 +- .../api/cldnn/primitives/concatenation.hpp | 6 +- .../clDNN/api/cldnn/primitives/condition.hpp | 3 +- .../api/cldnn/primitives/convolution.hpp | 52 +++-- .../clDNN/api/cldnn/primitives/crop.hpp | 9 +- .../cldnn/primitives/ctc_greedy_decoder.hpp | 3 +- .../clDNN/api/cldnn/primitives/cum_sum.hpp | 3 +- .../cldnn/primitives/custom_gpu_primitive.hpp | 5 +- .../clDNN/api/cldnn/primitives/data.hpp | 4 +- .../api/cldnn/primitives/deconvolution.hpp | 25 ++- .../api/cldnn/primitives/depth_to_space.hpp | 3 +- .../api/cldnn/primitives/detection_output.hpp | 3 +- .../clDNN/api/cldnn/primitives/eltwise.hpp | 15 +- .../api/cldnn/primitives/embedding_bag.hpp | 3 +- .../primitives/extract_image_patches.hpp | 3 +- .../api/cldnn/primitives/fully_connected.hpp | 24 +-- .../cldnn/primitives/fused_conv_eltwise.hpp | 3 +- .../clDNN/api/cldnn/primitives/gather.hpp | 3 +- .../api/cldnn/primitives/gather_elements.hpp | 3 +- .../clDNN/api/cldnn/primitives/gather_nd.hpp | 13 +- .../api/cldnn/primitives/gather_tree.hpp | 13 +- .../clDNN/api/cldnn/primitives/gemm.hpp | 3 +- .../clDNN/api/cldnn/primitives/grn.hpp | 3 +- .../api/cldnn/primitives/input_layout.hpp | 4 +- .../clDNN/api/cldnn/primitives/loop.hpp | 27 +-- .../clDNN/api/cldnn/primitives/lrn.hpp | 3 +- .../clDNN/api/cldnn/primitives/lstm.hpp | 9 +- .../api/cldnn/primitives/lstm_dynamic.hpp | 3 +- .../cldnn/primitives/lstm_dynamic_input.hpp | 3 +- .../primitives/lstm_dynamic_timeloop.hpp | 3 +- .../api/cldnn/primitives/max_unpooling.hpp | 6 +- .../api/cldnn/primitives/mutable_data.hpp | 10 +- .../clDNN/api/cldnn/primitives/mvn.hpp | 3 +- .../cldnn/primitives/non_max_suppression.hpp | 5 +- .../clDNN/api/cldnn/primitives/normalize.hpp | 3 +- .../clDNN/api/cldnn/primitives/one_hot.hpp | 6 +- .../clDNN/api/cldnn/primitives/permute.hpp | 3 +- .../clDNN/api/cldnn/primitives/pooling.hpp | 16 +- .../clDNN/api/cldnn/primitives/primitive.hpp | 8 +- .../clDNN/api/cldnn/primitives/prior_box.hpp | 6 +- .../clDNN/api/cldnn/primitives/proposal.hpp | 9 +- .../cldnn/primitives/pyramid_roi_align.hpp | 16 +- .../clDNN/api/cldnn/primitives/quantize.hpp | 3 +- .../clDNN/api/cldnn/primitives/reduce.hpp | 11 +- .../api/cldnn/primitives/region_yolo.hpp | 3 +- .../clDNN/api/cldnn/primitives/reorder.hpp | 42 ++-- .../clDNN/api/cldnn/primitives/reorg_yolo.hpp | 3 +- .../clDNN/api/cldnn/primitives/resample.hpp | 9 +- .../clDNN/api/cldnn/primitives/reshape.hpp | 3 +- .../api/cldnn/primitives/reverse_sequence.hpp | 3 +- .../api/cldnn/primitives/roi_pooling.hpp | 6 +- .../clDNN/api/cldnn/primitives/scale.hpp | 6 +- 
.../primitives/scatter_elements_update.hpp | 13 +- .../cldnn/primitives/scatter_nd_update.hpp | 13 +- .../api/cldnn/primitives/scatter_update.hpp | 3 +- .../clDNN/api/cldnn/primitives/select.hpp | 3 +- .../api/cldnn/primitives/shuffle_channels.hpp | 3 +- .../clDNN/api/cldnn/primitives/softmax.hpp | 3 +- .../api/cldnn/primitives/space_to_batch.hpp | 3 +- .../api/cldnn/primitives/space_to_depth.hpp | 3 +- .../clDNN/api/cldnn/primitives/split.hpp | 3 +- .../api/cldnn/primitives/strided_slice.hpp | 3 +- .../clDNN/api/cldnn/primitives/tile.hpp | 3 +- .../graph_optimizer/graph_initializations.cpp | 2 + .../graph_optimizer/pre_replace_deconv.cpp | 3 + .../graph_optimizer/prepare_quantization.cpp | 1 + .../clDNN/src/include/generic_layer.hpp | 3 +- .../clDNN/src/include/primitive_inst.h | 1 + .../clDNN/src/include/program_node.h | 2 + .../thirdparty/clDNN/src/network.cpp | 15 ++ .../thirdparty/clDNN/src/program.cpp | 1 + .../test_cases/activation_simple_gpu_test.cpp | 14 +- .../tests/test_cases/arg_max_gpu_test.cpp | 18 +- .../test_cases/average_unpooling_gpu_test.cpp | 2 +- .../binary_convolution_gpu_test.cpp | 2 + .../test_cases/concatenation_gpu_test.cpp | 5 + .../tests/test_cases/convolution_gpu_test.cpp | 19 +- .../tests/test_cases/eltwise_gpu_test.cpp | 10 +- .../tests/test_cases/fusings_gpu_test.cpp | 13 +- .../test_cases/max_unpooling_gpu_test.cpp | 8 +- .../tests/test_cases/pooling_gpu_test.cpp | 94 ++++----- .../tests/test_cases/proposal_cpu_test.cpp | 1 + .../tests/test_cases/reorder_gpu_test.cpp | 179 +++++++++--------- .../tests/test_cases/reshape_gpu_test.cpp | 4 +- .../spatial_concatenate_gpu_test.cpp | 54 +++--- .../clDNN/tests/test_cases/streams_test.cpp | 2 +- .../clDNN/tests/test_utils/network_test.h | 2 +- 156 files changed, 1066 insertions(+), 652 deletions(-) create mode 100644 inference-engine/tests/functional/plugin/gpu/shared_tests_instances/execution_graph_tests/nms_transformation_for_last_node.cpp create mode 100644 inference-engine/tests/functional/plugin/gpu/shared_tests_instances/execution_graph_tests/remove_parameter.cpp diff --git a/inference-engine/src/cldnn_engine/cldnn_graph.cpp b/inference-engine/src/cldnn_engine/cldnn_graph.cpp index 75ea9d2a251..49d5212d37e 100644 --- a/inference-engine/src/cldnn_engine/cldnn_graph.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_graph.cpp @@ -60,8 +60,6 @@ CLDNNGraph::CLDNNGraph(std::shared_ptr graph, uint16_t stream_id) void CLDNNGraph::UpdateLayersMaps() { OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNGraph::UpdateLayersMaps"); primitiveIDs = m_program->primitiveIDs; - primitivesToIRLayersMap = m_program->primitivesToIRLayersMap; - IRToNgraphLayersMap = m_program->IRToNgraphLayersMap; prevPrimitiveIDs = m_program->prevPrimitiveIDs; profilingIDs = m_program->profilingIDs; perfMap = m_program->perfMap; @@ -219,25 +217,6 @@ std::shared_ptr CLDNNGraph::GetExecGraphInfoByPrimitivesInfo(s return res; }; - auto split_string = [](std::string src, std::string delimiter = ",") -> std::vector { - std::vector tokens; - std::string tokenBuf; - size_t prev = 0, pos = 0, srcLength = src.length(), delimLength = delimiter.length(); - do { - pos = src.find(delimiter, prev); - if (pos == std::string::npos) { - pos = srcLength; - } - tokenBuf = src.substr(prev, pos - prev); - if (!tokenBuf.empty()) { - tokens.push_back(tokenBuf); - } - prev = pos + delimLength; - } while (pos < srcLength && prev < srcLength); - - return tokens; - }; - auto remove_type_from_name = [](const std::string& name) -> std::string { auto it = 
std::find(name.begin(), name.end(), ':'); if (it == name.end() || (it + 1) == name.end()) @@ -246,22 +225,13 @@ std::shared_ptr CLDNNGraph::GetExecGraphInfoByPrimitivesInfo(s return std::string((it+1), name.end()); }; + auto extIdMap = GetNetwork()->get_ext_id_mapping(); + auto find_origin_layers = [&](const std::string& name) -> std::vector { - if (primitivesToIRLayersMap.find(name) == primitivesToIRLayersMap.end()) + if (extIdMap.find(name) == extIdMap.end()) { return {}; - - auto cnn_names = primitivesToIRLayersMap.at(name); - std::vector res; - - for (auto& cnn_name : cnn_names) { - if (IRToNgraphLayersMap.find(cnn_name) != IRToNgraphLayersMap.end()) { - auto ngraph_names = split_string(IRToNgraphLayersMap.at(cnn_name)); - res.insert(res.end(), ngraph_names.begin(), ngraph_names.end()); - } else { - res.push_back(cnn_name); - } } - return res; + return { extIdMap.at(name) }; }; auto get_inputs = [&] (const cldnn::primitive_info& prim_info) { @@ -599,13 +569,21 @@ std::map CLDNNGraph::G auto allIds = GetNetwork()->get_all_primitive_org_ids(); auto executedPrimitives = GetNetwork()->get_executed_primitives(); auto primitivesInfo = GetNetwork()->get_primitives_info(); + auto extIdMap = GetNetwork()->get_ext_id_mapping(); - auto getUpperCaseName = [&](std::string name) { + auto getUpperCaseName = [](std::string name) { if (name.length() > 0) name[0] = toupper(name[0]); return name; }; + auto getClearName = [](std::string name) { + if (name.find(":") != std::string::npos) { + name = name.substr(name.find(":") + 1, name.length()); + } + return name; + }; + auto getFromProfiling = [&](std::string primId) -> bool { auto perfIter = perfMap.find(primId); @@ -696,10 +674,7 @@ std::map CLDNNGraph::G } } - std::string layerName = primId; - if (primId.find(":") != std::string::npos) { - layerName = primId.substr(primId.find(":") + 1, primId.length()); - } + std::string layerName = getClearName(primId); for (auto& pi : primitivesInfo) { if (pi.original_id == primId) { @@ -735,10 +710,27 @@ std::map CLDNNGraph::G } // Step 3. 
Checking primitives which has been deleted from execution order but added by clDNNPlugin - for (auto& primId : profilingIDs) + for (auto& primId : profilingIDs) { if (std::find(allIds.begin(), allIds.end(), primId) == allIds.end()) { getFromProfiling(primId); } + } + + for (auto& p : extIdMap) { + if (p.first.find(p.second) != std::string::npos) { + continue; + } + auto first_res = result.find(getClearName(p.first)); + auto second_res = result.find(getClearName(p.second)); + + if (first_res != result.end() && second_res != result.end() && first_res != second_res) { + std::swap(first_res->second.cpu_uSec, second_res->second.cpu_uSec); + std::swap(first_res->second.realTime_uSec, second_res->second.realTime_uSec); + std::swap(first_res->second.status, second_res->second.status); + std::swap(first_res->second.exec_type, second_res->second.exec_type); + std::swap(first_res->second.execution_index, second_res->second.execution_index); + } + } return result; } diff --git a/inference-engine/src/cldnn_engine/cldnn_graph.h b/inference-engine/src/cldnn_engine/cldnn_graph.h index 206c58aaccd..d220c4009f5 100644 --- a/inference-engine/src/cldnn_engine/cldnn_graph.h +++ b/inference-engine/src/cldnn_engine/cldnn_graph.h @@ -61,8 +61,6 @@ protected: InferenceEngine::gpu::ClContext::Ptr m_context; std::vector> m_networks; std::map primitiveIDs; - std::map> primitivesToIRLayersMap; - std::map IRToNgraphLayersMap; std::map> prevPrimitiveIDs; std::map> perfMap; diff --git a/inference-engine/src/cldnn_engine/cldnn_program.cpp b/inference-engine/src/cldnn_engine/cldnn_program.cpp index 7386501f0b1..57e96a14201 100644 --- a/inference-engine/src/cldnn_engine/cldnn_program.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_program.cpp @@ -284,14 +284,12 @@ std::vector Program::GetInputPrimitiveIDs(const std::shared void Program::AddPrimitiveToProfiler(const std::shared_ptr& op, cldnn::primitive_id customOutputId) { auto id = layer_type_name_ID(op); - primitivesToIRLayersMap[id] = { op->get_friendly_name() }; primitiveIDs[id] = customOutputId.empty() ? id : customOutputId; profilingIDs.push_back(id); } void Program::AddPrimitiveToProfiler(cldnn::primitive_id id, const std::shared_ptr& op, cldnn::primitive_id customOutputId) { - primitivesToIRLayersMap[id] = { op->get_friendly_name() }; primitiveIDs[id] = customOutputId.empty() ? 
id : customOutputId; profilingIDs.push_back(id); } @@ -299,7 +297,6 @@ void Program::AddPrimitiveToProfiler(cldnn::primitive_id id, const std::shared_p void Program::AddInnerPrimitiveToProfiler(cldnn::primitive_id id, cldnn::primitive_id parentId, const std::shared_ptr& op) { InitProfileInfo(id, layer_type_lower(op), false, InferenceEngine::InferenceEngineProfileInfo::EXECUTED, parentId); - primitivesToIRLayersMap[id] = { op->get_friendly_name() }; primitiveIDs[id] = id; profilingIDs.push_back(id); } diff --git a/inference-engine/src/cldnn_engine/cldnn_program.h b/inference-engine/src/cldnn_engine/cldnn_program.h index 23b6313cac9..f499104a9b9 100644 --- a/inference-engine/src/cldnn_engine/cldnn_program.h +++ b/inference-engine/src/cldnn_engine/cldnn_program.h @@ -76,8 +76,6 @@ public: static const cldnn::primitive_id m_postCustomLayerTag; std::map primitiveIDs; - std::map> primitivesToIRLayersMap; - std::map IRToNgraphLayersMap; std::map> prevPrimitiveIDs; std::map> perfMap; diff --git a/inference-engine/src/cldnn_engine/ops/batch_to_space.cpp b/inference-engine/src/cldnn_engine/ops/batch_to_space.cpp index e46643f0617..d67f9e9888a 100644 --- a/inference-engine/src/cldnn_engine/ops/batch_to_space.cpp +++ b/inference-engine/src/cldnn_engine/ops/batch_to_space.cpp @@ -42,7 +42,8 @@ void CreateBatchToSpaceOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(batchToSpacePrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/broadcast.cpp b/inference-engine/src/cldnn_engine/ops/broadcast.cpp index 6d6e6303200..16264a31fea 100644 --- a/inference-engine/src/cldnn_engine/ops/broadcast.cpp +++ b/inference-engine/src/cldnn_engine/ops/broadcast.cpp @@ -31,8 +31,13 @@ static void CreateCommonBroadcastOp(Program& p, const std::shared_ptrget_input_element_type(0)); - auto reorderPrim = cldnn::reorder(reorderName, inputPrimitive, targetFormat, targetDatatype); - + auto reorderPrim = cldnn::reorder(reorderName, + inputPrimitive, + targetFormat, + targetDatatype, + std::vector(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name()); p.AddPrimitive(reorderPrim); p.AddInnerPrimitiveToProfiler(reorderName, layerName, op); @@ -66,7 +71,7 @@ static void CreateCommonBroadcastOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(reshapePrim); p.AddInnerPrimitiveToProfiler(reshapeName, layerName, op); @@ -75,7 +80,9 @@ static void CreateCommonBroadcastOp(Program& p, const std::shared_ptrget_output_shape(0))); + CldnnTensorFromIEDims(op->get_output_shape(0)), + {}, + op->get_friendly_name()); p.AddPrimitive(broadcastPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/concat.cpp b/inference-engine/src/cldnn_engine/ops/concat.cpp index 453e9996530..fc1e51d1981 100644 --- a/inference-engine/src/cldnn_engine/ops/concat.cpp +++ b/inference-engine/src/cldnn_engine/ops/concat.cpp @@ -45,7 +45,8 @@ void CreateConcatOp(Program& p, const std::shared_ptr& o layerName, inputPrimitives, GetConcatAxis(op->get_axis(), op->get_input_shape(0).size()), - DataTypeFromPrecision(op->get_output_element_type(0))); + DataTypeFromPrecision(op->get_output_element_type(0)), + op->get_friendly_name()); p.AddPrimitive(concatPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/constant.cpp b/inference-engine/src/cldnn_engine/ops/constant.cpp index fea42f31d98..5f28cd2e602 100644 --- a/inference-engine/src/cldnn_engine/ops/constant.cpp +++ b/inference-engine/src/cldnn_engine/ops/constant.cpp 
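The change repeated throughout the ops/*.cpp files in this commit is mechanical: every clDNN primitive is now constructed with the originating ngraph op's friendly name as its trailing ext_prim_id argument. The concat.cpp hunk above is representative:

    auto concatPrim = cldnn::concatenation(layerName,
                                           inputPrimitives,
                                           GetConcatAxis(op->get_axis(), op->get_input_shape(0).size()),
                                           DataTypeFromPrecision(op->get_output_element_type(0)),
                                           op->get_friendly_name());   // ext_prim_id
    p.AddPrimitive(concatPrim);
    p.AddPrimitiveToProfiler(op);

get_ext_id_mapping() (see the cldnn_graph.cpp changes above) later maps runtime primitive ids back to these friendly names when performance counters are reported, which is what allows the removed primitivesToIRLayersMap / IRToNgraphLayersMap bookkeeping to go away.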
@@ -199,7 +199,7 @@ void CreateConstantOp(Program& p, const std::shared_ptrget_friendly_name())); p.blobMemCache[std::make_pair(data, constDims)] = initialconstPrimID; constPrimID = initialconstPrimID; } diff --git a/inference-engine/src/cldnn_engine/ops/convert.cpp b/inference-engine/src/cldnn_engine/ops/convert.cpp index 6af5bee759d..603eb26abeb 100644 --- a/inference-engine/src/cldnn_engine/ops/convert.cpp +++ b/inference-engine/src/cldnn_engine/ops/convert.cpp @@ -19,8 +19,13 @@ void CreateConvertLikeOp(Program& p, const std::shared_ptrget_input_element_type(1)); - auto reorderPrim = cldnn::reorder(layerName, inputPrimitives[0], cldnn::format::any, outDataType); - + auto reorderPrim = cldnn::reorder(layerName, + inputPrimitives[0], + cldnn::format::any, + outDataType, + std::vector(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name()); p.AddPrimitive(reorderPrim); p.AddPrimitiveToProfiler(op); } @@ -32,7 +37,13 @@ void CreateConvertOp(Program& p, const std::shared_ptr& auto outDataType = DataTypeFromPrecision(op->get_destination_type()); - auto reorderPrim = cldnn::reorder(layerName, inputPrimitives[0], cldnn::format::any, outDataType); + auto reorderPrim = cldnn::reorder(layerName, + inputPrimitives[0], + cldnn::format::any, + outDataType, + std::vector(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name()); p.AddPrimitive(reorderPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/convolution.cpp b/inference-engine/src/cldnn_engine/ops/convolution.cpp index 83f536a68b7..e8c44693dbd 100644 --- a/inference-engine/src/cldnn_engine/ops/convolution.cpp +++ b/inference-engine/src/cldnn_engine/ops/convolution.cpp @@ -84,7 +84,8 @@ void CreateGroupConvolutionOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(convPrim); p.AddPrimitiveToProfiler(op); @@ -112,7 +113,8 @@ void CreateConvolutionOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(convPrim); p.AddPrimitiveToProfiler(op); @@ -146,7 +148,8 @@ void CreateConvolutionBackpropDataOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(permutePrim); p.AddInnerPrimitiveToProfiler(permuteName, layerName, op); @@ -159,14 +162,15 @@ void CreateConvolutionBackpropDataOp(Program& p, const std::shared_ptrget_pads_begin(), op->get_dilations(), op->get_strides(), 1); auto deconvPrim = cldnn::deconvolution(layerName, - inputs[0], - weights, - {}, - params.groups, - params.stride, - params.padding, - CldnnTensorFromIEDims(op->get_output_tensor(0).get_shape()), - weights_have_group_dim); + inputs[0], + weights, + {}, + params.groups, + params.stride, + params.padding, + CldnnTensorFromIEDims(op->get_output_tensor(0).get_shape()), + weights_have_group_dim, + op->get_friendly_name()); p.AddPrimitive(deconvPrim); p.AddPrimitiveToProfiler(op); @@ -202,7 +206,8 @@ void CreateGroupConvolutionBackpropDataOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(permutePrim); p.AddInnerPrimitiveToProfiler(permuteName, layerName, op); @@ -214,14 +219,15 @@ void CreateGroupConvolutionBackpropDataOp(Program& p, const std::shared_ptrget_output_tensor(0).get_shape()), - weights_have_group_dim); + inputs[0], + weights, + {}, + params.groups, + params.stride, + params.padding, + CldnnTensorFromIEDims(op->get_output_tensor(0).get_shape()), + weights_have_group_dim, + op->get_friendly_name()); p.AddPrimitive(deconvPrim); p.AddPrimitiveToProfiler(op); @@ -247,7 +253,8 @@ void CreateDeformableConvolutionOp(Program& p, const 
std::shared_ptrget_friendly_name()); p.AddPrimitive(convPrim); p.AddPrimitiveToProfiler(op); @@ -280,7 +287,8 @@ void CreateDeformableConvolutionOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(defConvPrimInterp); p.AddInnerPrimitiveToProfiler(defConvLayerNameInterp, defConvLayerNameConv, op); auto defConvPrim = cldnn::deformable_conv(defConvLayerNameConv, @@ -288,7 +296,8 @@ void CreateDeformableConvolutionOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(defConvPrim); p.AddPrimitiveToProfiler(defConvLayerNameConv, op); } @@ -313,7 +322,8 @@ void CreateBinaryConvolutionOp(Program& p, const std::shared_ptrget_pad_value(), - calc_precision); + calc_precision, + op->get_friendly_name()); p.AddPrimitive(convPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/ctc_greedy_decoder.cpp b/inference-engine/src/cldnn_engine/ops/ctc_greedy_decoder.cpp index c8bd8d54e07..7dde9c8b92a 100644 --- a/inference-engine/src/cldnn_engine/ops/ctc_greedy_decoder.cpp +++ b/inference-engine/src/cldnn_engine/ops/ctc_greedy_decoder.cpp @@ -33,7 +33,10 @@ void CreateCommonCTCGreedyDecoderOp(Program& p, const std::shared_ptr(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name()); p.AddPrimitive(preprocessPrim); p.AddInnerPrimitiveToProfiler(reorderPrimName, layer_type_name_ID(op), op); reorderedInputs[portIndex] = (reorderPrimName); @@ -73,8 +76,9 @@ void CreateCommonCTCGreedyDecoderOp(Program& p, const std::shared_ptrget_friendly_name() }; + auto ctc_gd_mutable_prim = cldnn::mutable_data(ctc_gd_mutable_id_w, + shared_memory[0], + op->get_friendly_name()); p.primitiveIDs[ctc_gd_mutable_id_w] = ctc_gd_mutable_id_w; p.AddPrimitive(ctc_gd_mutable_prim); reorderedInputs.push_back(ctc_gd_mutable_id_w); @@ -86,7 +90,8 @@ void CreateCommonCTCGreedyDecoderOp(Program& p, const std::shared_ptrget_output_shape(0))); + CldnnTensorFromIEDims(op->get_output_shape(0)), + op->get_friendly_name()); // clDNN primitive supports only i32 as output data type primitive.output_data_type = DataTypeFromPrecision(ngraph::element::i32); @@ -99,8 +104,10 @@ void CreateCommonCTCGreedyDecoderOp(Program& p, const std::shared_ptrget_friendly_name() }; + auto ctc_gd_mutable_prim_r = cldnn::mutable_data(ctc_gd_mutable_id_r, + { CTCGreedyDecoderLayerName }, + shared_memory[0], + op->get_friendly_name()); p.primitiveIDs[ctc_gd_mutable_id_r] = ctc_gd_mutable_id_r; p.AddPrimitive(ctc_gd_mutable_prim_r); } diff --git a/inference-engine/src/cldnn_engine/ops/cum_sum.cpp b/inference-engine/src/cldnn_engine/ops/cum_sum.cpp index 1bdcec2957e..9f8e2a463f2 100644 --- a/inference-engine/src/cldnn_engine/ops/cum_sum.cpp +++ b/inference-engine/src/cldnn_engine/ops/cum_sum.cpp @@ -63,7 +63,8 @@ void CreateCumSumOp(Program& p, const std::shared_ptr& o inputPrimitives[0], GetCumSumAxis(axis, rank), exclusive, - reverse); + reverse, + op->get_friendly_name()); p.AddPrimitive(primitive); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/custom.cpp b/inference-engine/src/cldnn_engine/ops/custom.cpp index 85945bfbdb9..c967100d1f9 100644 --- a/inference-engine/src/cldnn_engine/ops/custom.cpp +++ b/inference-engine/src/cldnn_engine/ops/custom.cpp @@ -145,7 +145,10 @@ void CreateCustomOp(Program& p, const std::shared_ptr& op, CLDNNCu reorderPrimName, inputPrimitives[param.portIndex], param.format, - DataTypeFromPrecision(op->get_input_element_type(param.portIndex))); + DataTypeFromPrecision(op->get_input_element_type(param.portIndex)), + std::vector(), 
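+                                            // the empty mean-values vector and subtract mean mode appear to be
+                                            // the defaults, spelled out only so the trailing friendly-name
+                                            // argument can be passed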
+ cldnn::reorder_mean_mode::subtract, + op->get_friendly_name()); p.AddPrimitive(preprocessPrim); p.AddInnerPrimitiveToProfiler(reorderPrimName, layer_type_name_ID(op), op); @@ -229,7 +232,8 @@ void CreateCustomOp(Program& p, const std::shared_ptr& op, CLDNNCu customLayer->CompilerOptions(), outputLayout, gws, - lws); + lws, + op->get_friendly_name()); auto prevLayerName = genericLayerName; if (outputLayout.format != cldnn::format::any) { @@ -239,7 +243,10 @@ void CreateCustomOp(Program& p, const std::shared_ptr& op, CLDNNCu cldnn::reorder(reorderPrimName, genericLayerName, DefaultFormatForDims(op->get_output_shape(0).size()), - customPrim.output_layout.data_type)); + customPrim.output_layout.data_type, + std::vector(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name())); prevLayerName = reorderPrimName; p.AddInnerPrimitiveToProfiler(reorderPrimName, layer_type_name_ID(op), op); } diff --git a/inference-engine/src/cldnn_engine/ops/depth_to_space.cpp b/inference-engine/src/cldnn_engine/ops/depth_to_space.cpp index b53262ab23d..aa762a356d7 100644 --- a/inference-engine/src/cldnn_engine/ops/depth_to_space.cpp +++ b/inference-engine/src/cldnn_engine/ops/depth_to_space.cpp @@ -33,7 +33,8 @@ void CreateDepthToSpaceOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(depthToSpacePrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/detection_output.cpp b/inference-engine/src/cldnn_engine/ops/detection_output.cpp index aa2b505f0e7..80616090655 100644 --- a/inference-engine/src/cldnn_engine/ops/detection_output.cpp +++ b/inference-engine/src/cldnn_engine/ops/detection_output.cpp @@ -75,7 +75,8 @@ void CreateDetectionOutputOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(detectionPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/eltwise.cpp b/inference-engine/src/cldnn_engine/ops/eltwise.cpp index 817512d5bf9..f486bff593d 100644 --- a/inference-engine/src/cldnn_engine/ops/eltwise.cpp +++ b/inference-engine/src/cldnn_engine/ops/eltwise.cpp @@ -46,7 +46,13 @@ void CreateElementwiseOp(Program& p, const std::shared_ptr& op, cl if (targetFormat.value != DefaultFormatForDims(inputRank).value) { auto reorderName = layerName + "_cldnn_in" + std::to_string(i) + "_reorder"; auto targetDatatype = DataTypeFromPrecision(op->get_input_element_type(i)); - auto reorderPrim = cldnn::reorder(reorderName, inputPrimitives[i], targetFormat, targetDatatype); + auto reorderPrim = cldnn::reorder(reorderName, + inputPrimitives[i], + targetFormat, + targetDatatype, + std::vector(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name()); p.AddPrimitive(reorderPrim); p.AddInnerPrimitiveToProfiler(reorderName, layerName, op); @@ -61,7 +67,7 @@ void CreateElementwiseOp(Program& p, const std::shared_ptr& op, cl auto targetShape = CldnnTensorFromIEDims(inputShape); - auto reshapePrim = cldnn::reshape(reshapeName, inputPrimitives[i], targetShape); + auto reshapePrim = cldnn::reshape(reshapeName, inputPrimitives[i], targetShape, op->get_friendly_name()); p.AddPrimitive(reshapePrim); p.AddInnerPrimitiveToProfiler(reshapeName, layerName, op); @@ -74,7 +80,8 @@ void CreateElementwiseOp(Program& p, const std::shared_ptr& op, cl inputPrimitives, mode, {}, - out_dt); + out_dt, + op->get_friendly_name()); p.AddPrimitive(eltwisePrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/embedding_bag.cpp b/inference-engine/src/cldnn_engine/ops/embedding_bag.cpp index 
2e97a60aebf..369c0eca44a 100644 --- a/inference-engine/src/cldnn_engine/ops/embedding_bag.cpp +++ b/inference-engine/src/cldnn_engine/ops/embedding_bag.cpp @@ -49,7 +49,10 @@ void CreateEmbeddingBagOffsetsSumOp(Program& p, const std::shared_ptr(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name()); p.AddPrimitive(preprocessPrim); p.AddInnerPrimitiveToProfiler(reorderPrimName, layer_type_name_ID(op), op); reorderedInputs[portIndex] = (reorderPrimName); @@ -62,7 +65,8 @@ void CreateEmbeddingBagOffsetsSumOp(Program& p, const std::shared_ptrget_output_shape(0)), - defaultIndex); + defaultIndex, + op->get_friendly_name()); p.AddPrimitive(embeddingBagPrim); p.AddPrimitiveToProfiler(op); @@ -86,7 +90,10 @@ void CreateEmbeddingBagPackedSumOp(Program& p, const std::shared_ptr(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name()); p.AddPrimitive(preprocessPrim); p.AddInnerPrimitiveToProfiler(reorderPrimName, layer_type_name_ID(op), op); reorderedInputs[portIndex] = (reorderPrimName); @@ -98,7 +105,9 @@ void CreateEmbeddingBagPackedSumOp(Program& p, const std::shared_ptrget_output_shape(0))); + CldnnTensorFromIEDims(op->get_output_shape(0)), + -1, + op->get_friendly_name()); p.AddPrimitive(embeddingBagPrim); p.AddPrimitiveToProfiler(op); @@ -140,7 +149,10 @@ void CreateEmbeddingSegmentsSumOp(Program& p, const std::shared_ptr(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name()); p.AddPrimitive(preprocessPrim); p.AddInnerPrimitiveToProfiler(reorderPrimName, layer_type_name_ID(op), op); reorderedInputs[portIndex] = (reorderPrimName); @@ -153,7 +165,8 @@ void CreateEmbeddingSegmentsSumOp(Program& p, const std::shared_ptrget_output_shape(0)), - defaultIndex); + defaultIndex, + op->get_friendly_name()); p.AddPrimitive(embeddingBagPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/extract_image_patches.cpp b/inference-engine/src/cldnn_engine/ops/extract_image_patches.cpp index 23b5f014320..088a0fcd413 100644 --- a/inference-engine/src/cldnn_engine/ops/extract_image_patches.cpp +++ b/inference-engine/src/cldnn_engine/ops/extract_image_patches.cpp @@ -38,7 +38,8 @@ void CreateExtractImagePatchesOp(Program& p, const std::shared_ptrget_output_shape(0))); + CldnnTensorFromIEDims(op->get_output_shape(0)), + op->get_friendly_name()); p.AddPrimitive(extractImagePatchesPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/fake_quantize.cpp b/inference-engine/src/cldnn_engine/ops/fake_quantize.cpp index 345a70f34bb..52ea33e5d3d 100644 --- a/inference-engine/src/cldnn_engine/ops/fake_quantize.cpp +++ b/inference-engine/src/cldnn_engine/ops/fake_quantize.cpp @@ -31,7 +31,8 @@ void CreateFakeQuantizeOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(quantizationPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/gather tree.cpp b/inference-engine/src/cldnn_engine/ops/gather tree.cpp index 6b73131fd29..5476aa40d97 100644 --- a/inference-engine/src/cldnn_engine/ops/gather tree.cpp +++ b/inference-engine/src/cldnn_engine/ops/gather tree.cpp @@ -30,7 +30,10 @@ void CreateGatherTreeOp(Program& p, const std::shared_ptr(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name()); p.AddPrimitive(preprocessPrim); p.AddInnerPrimitiveToProfiler(reorderPrimName, layerName, op); reorderedInputs[portIndex] = reorderPrimName; @@ -43,7 +46,8 @@ void CreateGatherTreeOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(gatherTreePrim); 
p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/gather.cpp b/inference-engine/src/cldnn_engine/ops/gather.cpp index d22258e0673..bcf632f3194 100644 --- a/inference-engine/src/cldnn_engine/ops/gather.cpp +++ b/inference-engine/src/cldnn_engine/ops/gather.cpp @@ -77,7 +77,10 @@ void CreateGatherOpBase(Program& p, const std::shared_ptr& op, const int64_t auto preprocessPrim = cldnn::reorder(reorderPrimName, inputPrimitives[portIndex], targetFormat, - cldnn::data_types::i32); + cldnn::data_types::i32, + std::vector(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name()); p.AddPrimitive(preprocessPrim); p.AddInnerPrimitiveToProfiler(reorderPrimName, layerName, op); reorderedInputs[portIndex] = reorderPrimName; @@ -94,7 +97,8 @@ void CreateGatherOpBase(Program& p, const std::shared_ptr& op, const int64_t outLayout, CldnnTensorFromIEDims(op->get_output_shape(0)), batch_dim, - support_neg_ind); + support_neg_ind, + op->get_friendly_name()); p.AddPrimitive(gatherPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/gather_elements.cpp b/inference-engine/src/cldnn_engine/ops/gather_elements.cpp index d6138280750..50c25c37f1c 100644 --- a/inference-engine/src/cldnn_engine/ops/gather_elements.cpp +++ b/inference-engine/src/cldnn_engine/ops/gather_elements.cpp @@ -55,7 +55,8 @@ void CreateGatherElementsOp(Program& p, const std::shared_ptrget_output_shape(0)), - GetGatherAxis(axis, rank)); + GetGatherAxis(axis, rank), + op->get_friendly_name()); p.AddPrimitive(primitive); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/gather_nd.cpp b/inference-engine/src/cldnn_engine/ops/gather_nd.cpp index cbdc5659bb3..266ada7f904 100644 --- a/inference-engine/src/cldnn_engine/ops/gather_nd.cpp +++ b/inference-engine/src/cldnn_engine/ops/gather_nd.cpp @@ -22,10 +22,11 @@ void CreateGatherNDOp(Program& p, const std::shared_ptrget_batch_dims(); auto primitive = cldnn::gather_nd(layerName, - inputPrimitives[0], - inputPrimitives[1], - indices_rank, - batch_dims); + inputPrimitives[0], + inputPrimitives[1], + indices_rank, + batch_dims, + op->get_friendly_name()); p.AddPrimitive(primitive); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/grn.cpp b/inference-engine/src/cldnn_engine/ops/grn.cpp index 960dd034947..c6d07fe6191 100644 --- a/inference-engine/src/cldnn_engine/ops/grn.cpp +++ b/inference-engine/src/cldnn_engine/ops/grn.cpp @@ -19,7 +19,8 @@ void CreateGRNOp(Program& p, const std::shared_ptr& op) { auto primitive = cldnn::grn(layerName, inputPrimitives[0], op->get_bias(), - DataTypeFromPrecision(op->get_output_element_type(0))); + DataTypeFromPrecision(op->get_output_element_type(0)), + op->get_friendly_name()); p.AddPrimitive(primitive); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/interpolate.cpp b/inference-engine/src/cldnn_engine/ops/interpolate.cpp index df99e6972ee..190032897b4 100644 --- a/inference-engine/src/cldnn_engine/ops/interpolate.cpp +++ b/inference-engine/src/cldnn_engine/ops/interpolate.cpp @@ -193,7 +193,8 @@ void CreateInterpolateOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(resamplePrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/loop.cpp b/inference-engine/src/cldnn_engine/ops/loop.cpp index 1ac452265b8..604f73b7e5a 100644 --- a/inference-engine/src/cldnn_engine/ops/loop.cpp +++ b/inference-engine/src/cldnn_engine/ops/loop.cpp @@ -29,11 +29,11 @@ 
using Loop = ngraph::op::v5::Loop; namespace CLDNNPlugin { template -static DATA_TYPE CreateScalarData(Program &p, const cldnn::primitive_id& id, int64_t num) { +static DATA_TYPE CreateScalarData(Program &p, const cldnn::primitive_id& id, int64_t num, const cldnn::primitive_id& ext_prim_id) { auto mem = p.GetEngine().allocate_memory({ cldnn::data_types::i64, cldnn::format::bfyx, { 1, 1, 1, 1 } }); cldnn::mem_lock ptr{mem, p.GetEngine().get_program_stream()}; *ptr.begin() = num; - return {id, mem}; + return {id, mem, ext_prim_id}; } static cldnn::mutable_data CreateAdditionalOutputData(Program &p, const std::shared_ptr& op, @@ -44,7 +44,7 @@ static cldnn::mutable_data CreateAdditionalOutputData(Program &p, const std::sha const auto tensor = CldnnTensorFromIEDims(op->get_output_shape(output_idx)); cldnn::layout output_layout = cldnn::layout(precision, format, tensor); auto mem = p.GetEngine().allocate_memory(output_layout); - auto md = cldnn::mutable_data(id, {input}, mem); // cldnn::data cannot set dependency + auto md = cldnn::mutable_data(id, {input}, mem, op->get_friendly_name()); // cldnn::data cannot set dependency return md; } @@ -161,8 +161,7 @@ void CreateLoopOp(Program& p, const std::shared_ptr& op) { } const cldnn::primitive_id num_iteration_id = layerName + "_numIteration"; { - cldnn::mutable_data num_iteration = CreateScalarData(p, num_iteration_id, 0); - p.primitivesToIRLayersMap[num_iteration_id] = { op->get_friendly_name() }; + cldnn::mutable_data num_iteration = CreateScalarData(p, num_iteration_id, 0, op->get_friendly_name()); p.primitiveIDs[num_iteration_id] = num_iteration_id; p.AddPrimitive(num_iteration); p.AddInnerPrimitiveToProfiler(num_iteration_id, layerName, op); @@ -216,7 +215,8 @@ void CreateLoopOp(Program& p, const std::shared_ptr& op) { back_edges, /* back edge mapping */ num_iterations, /* max iteration, i.e. 
length of iteration axis */ body_current_iteration_id, - body_execution_condition_id); + body_execution_condition_id, + op->get_friendly_name()); p.AddPrimitive(loopPrimitive); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/lrn.cpp b/inference-engine/src/cldnn_engine/ops/lrn.cpp index c13c17daaeb..28e55bf038f 100644 --- a/inference-engine/src/cldnn_engine/ops/lrn.cpp +++ b/inference-engine/src/cldnn_engine/ops/lrn.cpp @@ -38,7 +38,8 @@ void CreateLRNOp(Program& p, const std::shared_ptr& op) { static_cast(op->get_bias()), static_cast(op->get_alpha()), static_cast(op->get_beta()), - GetNormRegion(axis_value)); + GetNormRegion(axis_value), + op->get_friendly_name()); p.AddPrimitive(lrnPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/matmul.cpp b/inference-engine/src/cldnn_engine/ops/matmul.cpp index 3d09fc7fd4e..53b8fecd6c2 100644 --- a/inference-engine/src/cldnn_engine/ops/matmul.cpp +++ b/inference-engine/src/cldnn_engine/ops/matmul.cpp @@ -89,7 +89,8 @@ void CreateMatMulOp(Program& p, const std::shared_ptr& o auto permuteName = op->get_friendly_name() + "/transpose_b"; auto permutePrim = cldnn::permute(permuteName, weightsName, - cldnn_permute_order); + cldnn_permute_order, + op->get_friendly_name()); p.AddPrimitive(permutePrim); p.AddInnerPrimitiveToProfiler(permuteName, layerName, op); weightsName = permuteName; @@ -108,7 +109,8 @@ void CreateMatMulOp(Program& p, const std::shared_ptr& o auto permuteName = op->get_friendly_name() + "/transpose_a"; auto permutePrim = cldnn::permute(permuteName, inputName, - cldnn_permute_order); + cldnn_permute_order, + op->get_friendly_name()); p.AddPrimitive(permutePrim); p.AddInnerPrimitiveToProfiler(permuteName, layerName, op); inputName = permuteName; @@ -124,7 +126,10 @@ void CreateMatMulOp(Program& p, const std::shared_ptr& o IE_THROW() << "Inconsistent reshape in Matmul op: " << op->get_friendly_name(); auto reshapeInName = op->get_friendly_name() + suffix; - auto reshapeInPrim = cldnn::reshape(reshapeInName, inputName, CldnnTensorFromIEDims(reshapeSize)); + auto reshapeInPrim = cldnn::reshape(reshapeInName, + inputName, + CldnnTensorFromIEDims(reshapeSize), + op->get_friendly_name()); p.AddPrimitive(reshapeInPrim); p.AddInnerPrimitiveToProfiler(reshapeInName, layerName, op); return reshapeInName; @@ -144,6 +149,7 @@ void CreateMatMulOp(Program& p, const std::shared_ptr& o weightsName, "", DataTypeFromPrecision(op->get_output_element_type(0)), + op->get_friendly_name(), cldnn::padding(), input_rank); @@ -153,7 +159,7 @@ void CreateMatMulOp(Program& p, const std::shared_ptr& o if (reshape_fc) { auto outputShape = CldnnTensorFromIEDims(op->get_output_shape(0)); auto outReshapeName = layerName + "_cldnn_out_reshape"; - auto outReshapePrim = cldnn::reshape(outReshapeName, layerName, outputShape); + auto outReshapePrim = cldnn::reshape(outReshapeName, layerName, outputShape, op->get_friendly_name()); p.AddPrimitive(outReshapePrim); p.AddInnerPrimitiveToProfiler(outReshapeName, layerName, op); @@ -188,7 +194,13 @@ void CreateMatMulOp(Program& p, const std::shared_ptr& o if (targetFormat.value != DefaultFormatForDims(inputDimsN).value) { auto reorderName = layerName + "_cldnn_in" + std::to_string(i) + "_reorder"; auto targetDatatype = DataTypeFromPrecision(op->get_output_element_type(0)); - auto reorderPrim = cldnn::reorder(reorderName, inputPrimitives[i], targetFormat, targetDatatype); + auto reorderPrim = cldnn::reorder(reorderName, + inputPrimitives[i], + targetFormat, + 
targetDatatype, + std::vector(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name()); p.AddPrimitive(reorderPrim); p.AddInnerPrimitiveToProfiler(reorderName, layerName, op); @@ -227,7 +239,7 @@ void CreateMatMulOp(Program& p, const std::shared_ptr& o auto targetShape = gemmSpecificTensor(inputDims); - auto reshapePrim = cldnn::reshape(reshapeName, inputPrimitives[i], targetShape); + auto reshapePrim = cldnn::reshape(reshapeName, inputPrimitives[i], targetShape, op->get_friendly_name()); p.AddPrimitive(reshapePrim); p.AddInnerPrimitiveToProfiler(reshapeName, layerName, op); @@ -248,7 +260,8 @@ void CreateMatMulOp(Program& p, const std::shared_ptr& o transA, transB, alpha, - beta); + beta, + op->get_friendly_name()); p.AddPrimitive(gemmPrim); @@ -258,7 +271,7 @@ void CreateMatMulOp(Program& p, const std::shared_ptr& o if (outDimsN < 4) { auto outputShape = CldnnTensorFromIEDims(outDims); auto outReshapeName = layerName + "_cldnn_out_reshape"; - auto outReshapePrim = cldnn::reshape(outReshapeName, layerName, outputShape); + auto outReshapePrim = cldnn::reshape(outReshapeName, layerName, outputShape, op->get_friendly_name()); p.AddPrimitive(outReshapePrim); p.AddInnerPrimitiveToProfiler(outReshapeName, layerName, op); diff --git a/inference-engine/src/cldnn_engine/ops/mvn.cpp b/inference-engine/src/cldnn_engine/ops/mvn.cpp index b9cb376a24e..abd2128326b 100644 --- a/inference-engine/src/cldnn_engine/ops/mvn.cpp +++ b/inference-engine/src/cldnn_engine/ops/mvn.cpp @@ -24,7 +24,8 @@ static void CreateCommonMVNOp(Program& p, const std::shared_ptr& o normalize_variance, eps, eps_inside_sqrt, - across_channels); + across_channels, + op->get_friendly_name()); p.AddPrimitive(mvnPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/non_max_suppression.cpp b/inference-engine/src/cldnn_engine/ops/non_max_suppression.cpp index 8adaa3cfa76..df34657d6cb 100644 --- a/inference-engine/src/cldnn_engine/ops/non_max_suppression.cpp +++ b/inference-engine/src/cldnn_engine/ops/non_max_suppression.cpp @@ -41,7 +41,10 @@ void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_ptr(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name()); p.AddPrimitive(preprocessPrim); p.AddInnerPrimitiveToProfiler(reorderPrimName, layer_type_name_ID(op), op); reorderedInputs[portIndex] = (reorderPrimName); @@ -77,8 +80,9 @@ void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_ptrget_friendly_name() }; + auto nms_mutable_prim_second = cldnn::mutable_data(non_max_supression_mutable_id_w_second, + shared_memory.back(), + op->get_friendly_name()); p.primitiveIDs[non_max_supression_mutable_id_w_second] = non_max_supression_mutable_id_w_second; p.AddPrimitive(nms_mutable_prim_second); inputPrimitives.push_back(non_max_supression_mutable_id_w_second); @@ -94,8 +98,9 @@ void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_ptrget_friendly_name() }; + auto nms_mutable_prim_first = cldnn::mutable_data(non_max_supression_mutable_id_w_first, + shared_memory.back(), + op->get_friendly_name()); p.primitiveIDs[non_max_supression_mutable_id_w_first] = non_max_supression_mutable_id_w_first; p.AddPrimitive(nms_mutable_prim_first); inputPrimitives.push_back(non_max_supression_mutable_id_w_first); @@ -112,7 +117,9 @@ void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_ptr(outputIndices), op->m_center_point_box, - op->m_sort_result_descending); + op->m_sort_result_descending, + "", "", "", "", "", "", + op->get_friendly_name()); 
prim.output_data_type = DataTypeFromPrecision(out_type); @@ -136,15 +143,19 @@ void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_ptrget_friendly_name() }; + auto nms_mutable_prim_r_second = cldnn::mutable_data(non_max_supression_id_r_second, + { nonMaxSupressionLayerName }, + shared_memory.front(), + op->get_friendly_name()); p.primitiveIDs[non_max_supression_id_r_second] = non_max_supression_id_r_second; p.AddPrimitive(nms_mutable_prim_r_second); } case 2: { cldnn::primitive_id non_max_supression_id_r_first = layer_type_name_ID(op) + ".1"; - auto nms_mutable_prim_r_first = cldnn::mutable_data(non_max_supression_id_r_first, { nonMaxSupressionLayerName }, shared_memory.back()); - p.primitivesToIRLayersMap[non_max_supression_id_r_first] = { op->get_friendly_name() }; + auto nms_mutable_prim_r_first = cldnn::mutable_data(non_max_supression_id_r_first, + { nonMaxSupressionLayerName }, + shared_memory.back(), + op->get_friendly_name()); p.primitiveIDs[non_max_supression_id_r_first] = non_max_supression_id_r_first; p.AddPrimitive(nms_mutable_prim_r_first); } diff --git a/inference-engine/src/cldnn_engine/ops/normalize_l2.cpp b/inference-engine/src/cldnn_engine/ops/normalize_l2.cpp index 85f2eb95de8..315dee55952 100644 --- a/inference-engine/src/cldnn_engine/ops/normalize_l2.cpp +++ b/inference-engine/src/cldnn_engine/ops/normalize_l2.cpp @@ -45,14 +45,15 @@ void CreateNormalizeL2Op(Program& p, const std::shared_ptrget_data_ptr(), bufSize); auto scalesName = layerName + "_cldnn_input_scales"; - p.AddPrimitive(cldnn::data(scalesName, mem)); + p.AddPrimitive(cldnn::data(scalesName, mem, op->get_friendly_name())); p.AddInnerPrimitiveToProfiler(scalesName, layerName, op); auto normPrim = cldnn::normalize(layerName, inputPrimitives[0], scalesName, across_spatial, - eps); + eps, + op->get_friendly_name()); p.AddPrimitive(normPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/one_hot.cpp b/inference-engine/src/cldnn_engine/ops/one_hot.cpp index 3d792bda8ae..b7c4fe8a126 100644 --- a/inference-engine/src/cldnn_engine/ops/one_hot.cpp +++ b/inference-engine/src/cldnn_engine/ops/one_hot.cpp @@ -53,7 +53,8 @@ void CreateOneHotOp(Program& p, const std::shared_ptr& o DataTypeFromPrecision(op->get_output_element_type(0)), static_cast(axis), on_value, - off_value); + off_value, + op->get_friendly_name()); p.AddPrimitive(oneHotPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/pad.cpp b/inference-engine/src/cldnn_engine/ops/pad.cpp index 0d409414b58..40336df057c 100644 --- a/inference-engine/src/cldnn_engine/ops/pad.cpp +++ b/inference-engine/src/cldnn_engine/ops/pad.cpp @@ -66,7 +66,8 @@ void CreatePadOp(Program& p, const std::shared_ptr& op) { pads_begin, pads_end, border_mode, - pad_value); + pad_value, + op->get_friendly_name()); p.AddPrimitive(tilePrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/parameter.cpp b/inference-engine/src/cldnn_engine/ops/parameter.cpp index b68593dd0a5..6dd43841cf3 100644 --- a/inference-engine/src/cldnn_engine/ops/parameter.cpp +++ b/inference-engine/src/cldnn_engine/ops/parameter.cpp @@ -195,8 +195,8 @@ void CreateParameterOp(Program& p, const std::shared_ptrname()); + auto inputUV = cldnn::input_layout(uv_name, uv_layout, inputInfo->name()); p.AddPrimitive(inputY); p.inputLayouts.insert({ inputInfo->name() + "_Y" + std::to_string(i), y_layout }); @@ -205,20 +205,29 @@ void CreateParameterOp(Program& p, const std::shared_ptrname())); break; } case 
MEAN_IMAGE: { - p.AddPrimitive(cldnn::reorder(preprocessPrimID, y_name, uv_name, networkInputLayout, meanBlobID)); + p.AddPrimitive(cldnn::reorder(preprocessPrimID, + y_name, + uv_name, + networkInputLayout, + meanBlobID, + cldnn::reorder_mean_mode::subtract, + inputInfo->name())); break; } default: IE_THROW(Unexpected) << "Invalid mean variant in input " + inputName; break; } - p.primitivesToIRLayersMap[preprocessPrimID] = { inputInfo->name() }; - p.primitivesToIRLayersMap[y_name] = { inputInfo->name() }; - p.primitivesToIRLayersMap[uv_name] = { inputInfo->name() }; p.profilingIDs.push_back(preprocessPrimID); p.InitProfileInfo(preprocessPrimID, "Reorder"); p.primitiveIDs[inputName] = preprocessPrimID; // If it is batched blob, it will be overwritten afterwards. @@ -228,7 +237,7 @@ void CreateParameterOp(Program& p, const std::shared_ptr 1) { auto concatPrimID = "concat:" + inputName + Program::m_preProcessTag; - p.AddPrimitive(cldnn::concatenation(concatPrimID, reorders, cldnn::concatenation::along_b)); + p.AddPrimitive(cldnn::concatenation(concatPrimID, reorders, cldnn::concatenation::along_b, op->get_friendly_name())); p.primitiveIDs[inputName] = concatPrimID; } } else { @@ -237,20 +246,26 @@ void CreateParameterOp(Program& p, const std::shared_ptrname(), inputLayout }); - p.AddPrimitive(cldnn::input_layout(inputName, inputLayout)); - p.primitivesToIRLayersMap[inputName] = { inputInfo->name() }; + p.AddPrimitive(cldnn::input_layout(inputName, inputLayout, inputInfo->name())); switch (preProcess.getMeanVariant()) { case NONE: case MEAN_VALUE: { - p.AddPrimitive(cldnn::reorder(preprocessPrimID, inputName, networkInputLayout, meanValues)); + p.AddPrimitive(cldnn::reorder(preprocessPrimID, + inputName, + networkInputLayout, + meanValues, + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name())); break; } case MEAN_IMAGE: { p.AddPrimitive(cldnn::reorder(preprocessPrimID, - inputName, - networkInputLayout, - meanBlobID)); + inputName, + networkInputLayout, + meanBlobID, + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name())); break; } default: IE_THROW() << "Invalid mean variant in input " << inputName; diff --git a/inference-engine/src/cldnn_engine/ops/pooling.cpp b/inference-engine/src/cldnn_engine/ops/pooling.cpp index f1bf6952292..1e0db7bd204 100644 --- a/inference-engine/src/cldnn_engine/ops/pooling.cpp +++ b/inference-engine/src/cldnn_engine/ops/pooling.cpp @@ -70,7 +70,8 @@ void CreateAvgPoolOp(Program& p, const std::shared_ptr& params.stride, params.pad_begin, CldnnTensorFromIEDims(op->get_output_shape(0)), - DataTypeFromPrecision(op->get_output_element_type(0))); + DataTypeFromPrecision(op->get_output_element_type(0)), + op->get_friendly_name()); poolPrim.pad_end = params.pad_end; p.AddPrimitive(poolPrim); p.AddPrimitiveToProfiler(op); @@ -89,7 +90,8 @@ void CreateMaxPoolOp(Program& p, const std::shared_ptr& params.stride, params.pad_begin, CldnnTensorFromIEDims(op->get_output_shape(0)), - DataTypeFromPrecision(op->get_output_element_type(0))); + DataTypeFromPrecision(op->get_output_element_type(0)), + op->get_friendly_name()); poolPrim.pad_end = params.pad_end; p.AddPrimitive(poolPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/prior_box.cpp b/inference-engine/src/cldnn_engine/ops/prior_box.cpp index 6cf0aaa6535..43eb5a69941 100644 --- a/inference-engine/src/cldnn_engine/ops/prior_box.cpp +++ b/inference-engine/src/cldnn_engine/ops/prior_box.cpp @@ -54,7 +54,8 @@ void CreatePriorBoxClusteredOp(Program& p, const 
std::shared_ptrget_output_element_type(0))); + DataTypeFromPrecision(op->get_output_element_type(0)), + op->get_friendly_name()); p.AddPrimitive(priorBoxPrim); p.AddPrimitiveToProfiler(op); @@ -103,7 +104,8 @@ void CreatePriorBoxOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(priorBoxPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/proposal.cpp b/inference-engine/src/cldnn_engine/ops/proposal.cpp index d5b906e5e6e..34677bd82ab 100644 --- a/inference-engine/src/cldnn_engine/ops/proposal.cpp +++ b/inference-engine/src/cldnn_engine/ops/proposal.cpp @@ -65,8 +65,9 @@ void CreateProposalOp(Program& p, const std::shared_ptrget_friendly_name() }; + auto argmax_mutable_prim = cldnn::mutable_data(proposal_mutable_id_w, + shared_memory, + op->get_friendly_name()); p.primitiveIDs[proposal_mutable_id_w] = proposal_mutable_id_w; p.AddPrimitive(argmax_mutable_prim); inputPrimitives.push_back(proposal_mutable_id_w); @@ -96,13 +97,16 @@ void CreateProposalOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(proposalPrim); cldnn::primitive_id proposal_mutable_id_r = layer_type_name_ID(op) + ".1"; - auto argmax_mutable_prim_r = cldnn::mutable_data(proposal_mutable_id_r, { proposalLayerName }, shared_memory); - p.primitivesToIRLayersMap[proposal_mutable_id_r] = { op->get_friendly_name() }; + auto argmax_mutable_prim_r = cldnn::mutable_data(proposal_mutable_id_r, + { proposalLayerName }, + shared_memory, + op->get_friendly_name()); p.primitiveIDs[proposal_mutable_id_r] = proposal_mutable_id_r; p.AddPrimitive(argmax_mutable_prim_r); @@ -134,7 +138,8 @@ void CreateProposalOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(proposalPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/reduce.cpp b/inference-engine/src/cldnn_engine/ops/reduce.cpp index b336a2e78fa..47a54c70299 100644 --- a/inference-engine/src/cldnn_engine/ops/reduce.cpp +++ b/inference-engine/src/cldnn_engine/ops/reduce.cpp @@ -75,7 +75,8 @@ void CreateReduceOp(Program& p, const std::shared_ptr& op, cldnn:: inputPrimitives[0], mode, axes, - static_cast(keep_dims)); + static_cast(keep_dims), + op->get_friendly_name()); p.AddPrimitive(reducePrim); @@ -96,7 +97,7 @@ void CreateReduceOp(Program& p, const std::shared_ptr& op, cldnn:: outTensor = cldnn::tensor(TensorValue(out_shape[0]), TensorValue(out_shape[1]), 1, TensorValue(out_shape[2])); } - auto reshape_prim = cldnn::reshape(resultLayerName, layerName, outTensor); + auto reshape_prim = cldnn::reshape(resultLayerName, layerName, outTensor, op->get_friendly_name()); p.AddPrimitive(reshape_prim); p.AddPrimitiveToProfiler(op, resultLayerName); } @@ -112,7 +113,13 @@ void CreateReduceOp(Program& p, const std::shared_ptr& op, cldnn:: else if (rank - rawAxes.size() <= 4) out_format = cldnn::format::bfyx; - auto reorder_prim = cldnn::reorder(reorderLayerName, resultLayerName, out_format, out_dt); + auto reorder_prim = cldnn::reorder(reorderLayerName, + resultLayerName, + out_format, + out_dt, + std::vector(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name()); p.AddPrimitive(reorder_prim); p.AddPrimitiveToProfiler(op, reorderLayerName); } else { diff --git a/inference-engine/src/cldnn_engine/ops/region_yolo.cpp b/inference-engine/src/cldnn_engine/ops/region_yolo.cpp index 348dd0f7eeb..314950027cb 100644 --- a/inference-engine/src/cldnn_engine/ops/region_yolo.cpp +++ b/inference-engine/src/cldnn_engine/ops/region_yolo.cpp @@ -28,7 +28,8 @@ void 
CreateRegionYoloOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(regionPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/reorg_yolo.cpp b/inference-engine/src/cldnn_engine/ops/reorg_yolo.cpp index 4a7f54cf810..9c47ccc9fb0 100644 --- a/inference-engine/src/cldnn_engine/ops/reorg_yolo.cpp +++ b/inference-engine/src/cldnn_engine/ops/reorg_yolo.cpp @@ -20,7 +20,8 @@ void CreateReorgYoloOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(reorgPrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/reshape.cpp b/inference-engine/src/cldnn_engine/ops/reshape.cpp index f0084bb6a1c..a4978fbae29 100644 --- a/inference-engine/src/cldnn_engine/ops/reshape.cpp +++ b/inference-engine/src/cldnn_engine/ops/reshape.cpp @@ -36,9 +36,13 @@ void CreateCommonReshapeOp(Program& p, const std::shared_ptr& op) } cldnn::layout outputLayout(DataTypeFromPrecision(op->get_output_element_type(0)), outputFormat, outTensor); - p.AddPrimitive(cldnn::reorder(reorderId, reshapeInputId, outputLayout)); + p.AddPrimitive(cldnn::reorder(reorderId, + reshapeInputId, + outputLayout, + std::vector(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name())); p.InitProfileInfo(reorderId, "Reorder", false, InferenceEngine::InferenceEngineProfileInfo::EXECUTED, layerName); - p.primitivesToIRLayersMap[reorderId] = { op->get_friendly_name() }; p.primitiveIDs[layerName + "_reorder"] = reorderId; p.primitiveIDs[reorderId] = reorderId; p.profilingIDs.push_back(reorderId); @@ -47,7 +51,8 @@ void CreateCommonReshapeOp(Program& p, const std::shared_ptr& op) auto reshapePrim = cldnn::reshape(layerName, reshapeInputId, - outTensor); + outTensor, + op->get_friendly_name()); p.AddPrimitive(reshapePrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/result.cpp b/inference-engine/src/cldnn_engine/ops/result.cpp index fe0d0f05658..c1219ad8fba 100644 --- a/inference-engine/src/cldnn_engine/ops/result.cpp +++ b/inference-engine/src/cldnn_engine/ops/result.cpp @@ -56,9 +56,12 @@ void CreateResultOp(Program& p, const std::shared_ptr& o std::string outputID = inputs[0]; p.AddPrimitive(cldnn::reorder(outLayerName, - outputID, - FormatFromLayout(outputData->getLayout()), - DataTypeFromPrecision(precision))); + outputID, + FormatFromLayout(outputData->getLayout()), + DataTypeFromPrecision(precision), + std::vector(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name())); p.InitProfileInfo(outLayerName, "reorder"); p.profilingIDs.push_back(outLayerName); p.primitiveIDs[outLayerName] = outLayerName; diff --git a/inference-engine/src/cldnn_engine/ops/reverse_sequence.cpp b/inference-engine/src/cldnn_engine/ops/reverse_sequence.cpp index 766bbc89a31..6421a01dc75 100644 --- a/inference-engine/src/cldnn_engine/ops/reverse_sequence.cpp +++ b/inference-engine/src/cldnn_engine/ops/reverse_sequence.cpp @@ -22,7 +22,8 @@ void CreateReverseSequenceOp(Program& p, const std::shared_ptrget_friendly_name()); p.AddPrimitive(reverseSequencePrim); p.AddPrimitiveToProfiler(op); diff --git a/inference-engine/src/cldnn_engine/ops/rnn.cpp b/inference-engine/src/cldnn_engine/ops/rnn.cpp index 2d4705f1a91..1ebaa0a7868 100644 --- a/inference-engine/src/cldnn_engine/ops/rnn.cpp +++ b/inference-engine/src/cldnn_engine/ops/rnn.cpp @@ -107,8 +107,13 @@ void CreateLSTMCellOp(Program& p, const std::shared_ptrget_friendly_name())); + p.AddPrimitive(cldnn::reorder(permuteID, + inReshapeID, + inputLayout, + std::vector(), + 
cldnn::reorder_mean_mode::subtract, + op->get_friendly_name())); p.AddInnerPrimitiveToProfiler(inReshapeID, op->get_friendly_name(), op); p.AddInnerPrimitiveToProfiler(permuteID, op->get_friendly_name(), op); @@ -117,11 +122,24 @@ void CreateLSTMCellOp(Program& p, const std::shared_ptrget_friendly_name())); + p.AddPrimitive(cldnn::reorder(hiddenInStr, + hiddenInResh, + hiddenLayout, + std::vector(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name())); + p.AddPrimitive(cldnn::reshape(cellInResh, inputPrimitives[2], inStateShape, op->get_friendly_name())); + p.AddPrimitive(cldnn::reorder(cellInStr, + cellInResh, + hiddenLayout, + std::vector(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name())); + p.AddPrimitive(cldnn::concatenation(input_concatID, + { permuteID, hiddenInStr }, + cldnn::concatenation::concatenation_axis::along_x, + op->get_friendly_name())); p.AddInnerPrimitiveToProfiler(hiddenInResh, op->get_friendly_name(), op); p.AddInnerPrimitiveToProfiler(hiddenInStr, op->get_friendly_name(), op); @@ -139,14 +157,19 @@ void CreateLSTMCellOp(Program& p, const std::shared_ptrget_friendly_name())); p.AddInnerPrimitiveToProfiler(WRconcatID, op->get_friendly_name(), op); - p.AddPrimitive(cldnn::fully_connected(lstm_fc_id, input_concatID, WRconcatID, hasBias ? biasID : "")); - p.AddPrimitive(cldnn::reshape(gemmReshapeID, lstm_fc_id, gemmSz)); - p.AddPrimitive(cldnn::reorder(gemmReorderID, gemmReshapeID, gemmLayout)); - p.AddPrimitive(cldnn::lstm_elt(lstm_elt_id, gemmReorderID, cellInStr, - clip, 0, activations, activation_params, cldnn::lstm_weights_order::fizo)); + p.AddPrimitive(cldnn::fully_connected(lstm_fc_id, input_concatID, WRconcatID, hasBias ? biasID : "", op->get_friendly_name())); + p.AddPrimitive(cldnn::reshape(gemmReshapeID, lstm_fc_id, gemmSz, op->get_friendly_name())); + p.AddPrimitive(cldnn::reorder(gemmReorderID, + gemmReshapeID, + gemmLayout, + std::vector(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name())); + p.AddPrimitive(cldnn::lstm_elt(lstm_elt_id, gemmReorderID, cellInStr, clip, 0, activations, + activation_params, cldnn::lstm_weights_order::fizo, 0, op->get_friendly_name())); p.AddInnerPrimitiveToProfiler(lstm_fc_id, op->get_friendly_name(), op); p.AddInnerPrimitiveToProfiler(gemmReshapeID, op->get_friendly_name(), op); @@ -156,16 +179,16 @@ void CreateLSTMCellOp(Program& p, const std::shared_ptrget_friendly_name())); p.AddInnerPrimitiveToProfiler(outputHiddenCropID, op->get_friendly_name(), op); - p.AddPrimitive(cldnn::reshape(outputHiddenID, outputHiddenCropID, outSz)); + p.AddPrimitive(cldnn::reshape(outputHiddenID, outputHiddenCropID, outSz, op->get_friendly_name())); p.AddInnerPrimitiveToProfiler(outputHiddenID, op->get_friendly_name(), op); cldnn::primitive_id outputCellCropID = layerName + "_cc"; cldnn::primitive_id outputCellID = layerName + ".1"; - p.AddPrimitive(cldnn::crop(outputCellCropID, lstm_elt_id, hiddenSz, cellCropSz)); + p.AddPrimitive(cldnn::crop(outputCellCropID, lstm_elt_id, hiddenSz, cellCropSz, op->get_friendly_name())); p.AddInnerPrimitiveToProfiler(outputCellCropID, op->get_friendly_name(), op); - p.AddPrimitive(cldnn::reshape(outputCellID, outputCellCropID, outSz)); + p.AddPrimitive(cldnn::reshape(outputCellID, outputCellCropID, outSz, op->get_friendly_name())); p.AddInnerPrimitiveToProfiler(outputCellID, op->get_friendly_name(), op); // output primitive IDs @@ -223,11 +246,16 @@ void CreateLSTMSequenceOp(Program& p, const std::shared_ptrget_friendly_name())); + 
p.AddPrimitive(cldnn::reorder(permuteID, + inReshapeID, + inputLayout, + std::vector(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name())); - p.AddPrimitive(cldnn::reshape(inHiddenStateID, inputPrimitives[1], inStateShape)); - p.AddPrimitive(cldnn::reshape(inCellStateID, inputPrimitives[2], inStateShape)); + p.AddPrimitive(cldnn::reshape(inHiddenStateID, inputPrimitives[1], inStateShape, op->get_friendly_name())); + p.AddPrimitive(cldnn::reshape(inCellStateID, inputPrimitives[2], inStateShape, op->get_friendly_name())); p.AddInnerPrimitiveToProfiler(inReshapeID, op->get_friendly_name(), op); p.AddInnerPrimitiveToProfiler(permuteID, op->get_friendly_name(), op); @@ -243,12 +271,12 @@ void CreateLSTMSequenceOp(Program& p, const std::shared_ptrget_friendly_name())); p.AddInnerPrimitiveToProfiler(WRconcatID, op->get_friendly_name(), op); std::vector WRreshapeSize = { 4 * size_t(lstm_hidden_size), size_t(lstm_input_size + lstm_hidden_size) }; cldnn::primitive_id WRreshapeID = WRconcatID + "_reshape"; - auto reshapeInPrim = cldnn::reshape(WRreshapeID, WRconcatID, CldnnTensorFromIEDims(WRreshapeSize)); + auto reshapeInPrim = cldnn::reshape(WRreshapeID, WRconcatID, CldnnTensorFromIEDims(WRreshapeSize), op->get_friendly_name()); p.AddPrimitive(reshapeInPrim); p.AddInnerPrimitiveToProfiler(WRreshapeID, op->get_friendly_name(), op); @@ -267,30 +295,35 @@ void CreateLSTMSequenceOp(Program& p, const std::shared_ptr(seqIdx), 0, 0 }; cldnn::primitive_id inputCrop_id = inputCropID + ":" + seqIdx_str; - p.AddPrimitive(cldnn::crop(inputCrop_id, permuteID, crop_tensor, offset_tensor)); + p.AddPrimitive(cldnn::crop(inputCrop_id, permuteID, crop_tensor, offset_tensor, op->get_friendly_name())); p.AddInnerPrimitiveToProfiler(inputCrop_id, op->get_friendly_name(), op); - p.AddPrimitive(cldnn::concatenation(concatID, { inputCrop_id, hiddenStr }, cldnn::concatenation::concatenation_axis::along_x)); + p.AddPrimitive(cldnn::concatenation(concatID, { inputCrop_id, hiddenStr }, cldnn::concatenation::concatenation_axis::along_x, op->get_friendly_name())); p.AddInnerPrimitiveToProfiler(concatID, op->get_friendly_name(), op); - p.AddPrimitive(cldnn::fully_connected(lstm_fc_id, concatID, WRreshapeID, biasID)); + p.AddPrimitive(cldnn::fully_connected(lstm_fc_id, concatID, WRreshapeID, biasID, op->get_friendly_name())); p.AddInnerPrimitiveToProfiler(lstm_fc_id, op->get_friendly_name(), op); - p.AddPrimitive(cldnn::reshape(lstm_fc_resh_id, lstm_fc_id, gemmSz)); - p.AddPrimitive(cldnn::reorder(lstm_fc_reor_id, lstm_fc_resh_id, gemmLayout)); - p.AddPrimitive(cldnn::lstm_elt(lstm_elt_id, lstm_fc_reor_id, cellStr, - clip, 0, activations, activation_params, cldnn::lstm_weights_order::fizo)); + p.AddPrimitive(cldnn::reshape(lstm_fc_resh_id, lstm_fc_id, gemmSz, op->get_friendly_name())); + p.AddPrimitive(cldnn::reorder(lstm_fc_reor_id, + lstm_fc_resh_id, + gemmLayout, + std::vector(), + cldnn::reorder_mean_mode::subtract, + op->get_friendly_name())); + p.AddPrimitive(cldnn::lstm_elt(lstm_elt_id, lstm_fc_reor_id, cellStr, clip, 0, activations, + activation_params, cldnn::lstm_weights_order::fizo, 0, op->get_friendly_name())); p.AddInnerPrimitiveToProfiler(lstm_fc_resh_id, op->get_friendly_name(), op); p.AddInnerPrimitiveToProfiler(lstm_fc_reor_id, op->get_friendly_name(), op); p.AddInnerPrimitiveToProfiler(lstm_elt_id, op->get_friendly_name(), op); hiddenStr = crop_id + ":hidden"; cellStr = crop_id + ":cell"; - p.AddPrimitive(cldnn::crop(hiddenStr, lstm_elt_id, hiddenSz, cldnn::tensor{ 0, 0, 0, 0 })); + 
p.AddPrimitive(cldnn::crop(hiddenStr, lstm_elt_id, hiddenSz, cldnn::tensor{ 0, 0, 0, 0 }, op->get_friendly_name())); p.AddInnerPrimitiveToProfiler(hiddenStr, op->get_friendly_name(), op); output_ids_offsets.push_back(hiddenStr); if (i < lstm_sequence_len - 1) { - p.AddPrimitive(cldnn::crop(cellStr, lstm_elt_id, hiddenSz, cellCropSz)); + p.AddPrimitive(cldnn::crop(cellStr, lstm_elt_id, hiddenSz, cellCropSz, op->get_friendly_name())); p.AddInnerPrimitiveToProfiler(cellStr, op->get_friendly_name(), op); } else { // last hidden state crop (output 2) @@ -299,7 +332,7 @@ void CreateLSTMSequenceOp(Program& p, const std::shared_ptrget_friendly_name())); cldnn::primitive_id outputCellID = layerName + ".2"; p.AddInnerPrimitiveToProfiler(cellStr, op->get_friendly_name(), op); p.primitiveIDs[outputCellID] = cellStr; @@ -310,7 +343,7 @@ void CreateLSTMSequenceOp(Program& p, const std::shared_ptr
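Every hunk in this part of the patch applies the same mechanical change: each cldnn primitive constructor gains a trailing argument carrying the friendly name of the nGraph operation it was created from, and the separate p.primitivesToIRLayersMap[...] = { op->get_friendly_name() } bookkeeping lines are dropped. For cldnn::reorder, passing that trailing argument also means the previously defaulted mean-values and mean-mode arguments have to be spelled out. The flattened text above lost angle-bracket contents, so std::vector() should be read as a vector of mean values. A minimal reconstruction of one representative call (taken from the gather.cpp hunk, with the stripped template argument assumed to be float) is:

    // Reconstruction sketch, not part of the patch: the reorder call from the
    // gather.cpp hunk after the change. The <float> template argument is an
    // assumption; the flattened text stripped it.
    auto preprocessPrim = cldnn::reorder(reorderPrimName,
                                         inputPrimitives[portIndex],
                                         targetFormat,
                                         cldnn::data_types::i32,               // indices converted to i32
                                         std::vector<float>(),                 // no mean values
                                         cldnn::reorder_mean_mode::subtract,   // previously defaulted, now explicit
                                         op->get_friendly_name());             // new trailing ext_prim_id argument

Helper functions follow the same pattern, e.g. the CreateScalarData template in the loop.cpp hunk now takes an ext_prim_id parameter and forwards it into the cldnn::mutable_data it returns.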