Implement CTPUT in AUTO code logic (#16220)

* Implement CTPUT in AUTO code logic

* Add logic to handle device loading failure

* add some code comments

* fix warning: conversion from size_t to int

* Updated code according to comments from bell and wanglei

* the preferred device code path needs to be updated for CTPUT as well

* add fallback logic for CTPUT

* Modify the code logic according to bell's suggestion

* Add prints for debugging

* throw exception when no device is available to run the pipeline task

* initialize idleWorkerRequest for CTPUT

* fix getting properties

Signed-off-by: fishbell <bell.song@intel.com>

refine

Signed-off-by: fishbell <bell.song@intel.com>

* fix warning

Signed-off-by: fishbell <bell.song@intel.com>

* fix illegal character on Windows

Signed-off-by: fishbell <bell.song@intel.com>

* fix illegal character

Signed-off-by: fishbell <bell.song@intel.com>

add missing include

Signed-off-by: fishbell <bell.song@intel.com>

* more code refine

Signed-off-by: fishbell <bell.song@intel.com>

---------

Signed-off-by: fishbell <bell.song@intel.com>
Co-authored-by: fishbell <bell.song@intel.com>
guozhong wang 2023-03-26 12:35:26 +08:00 committed by GitHub
parent e66b837104
commit 60ab7490bf
8 changed files with 432 additions and 192 deletions
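
For readers unfamiliar with the hint: CUMULATIVE_THROUGHPUT (CTPUT) makes the AUTO plugin load and run the network on all selected devices at once, rather than picking a single best device. A minimal usage sketch with the OpenVINO 2.0 C++ API (the model path and device list below are placeholders):

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;
        // "model.xml" is a placeholder path; "AUTO:GPU,CPU" restricts the candidate devices
        auto model = core.read_model("model.xml");
        auto compiled = core.compile_model(
            model,
            "AUTO:GPU,CPU",
            ov::hint::performance_mode(ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT));
        // requests are scheduled across all devices the model was loaded on
        auto request = compiled.create_infer_request();
        return 0;
    }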

View File

@@ -64,9 +64,8 @@ IE::Parameter AutoExecutableNetwork::GetMetric(const std::string& name) const {
return decltype(ov::device::priorities)::value_type {value->second.as<std::string>()};
} else if (name == ov::device::properties) {
ov::AnyMap all_devices = {};
if (_autoSchedule->_loadContext[ACTUALDEVICE].isAlready) {
auto get_device_supported_metrics = [&all_devices] (const AutoLoadContext& context) {
ov::AnyMap device_properties = {};
auto& context = _autoSchedule->_loadContext[ACTUALDEVICE];
auto device_supported_metrics = context.executableNetwork->GetMetric(METRIC_KEY(SUPPORTED_METRICS));
for (auto&& property_name : device_supported_metrics.as<std::vector<std::string>>()) {
device_properties[property_name] = context.executableNetwork->GetMetric(property_name);
@@ -76,6 +75,26 @@ IE::Parameter AutoExecutableNetwork::GetMetric(const std::string& name) const {
device_properties[property_name] = context.executableNetwork->GetConfig(property_name);
}
all_devices[context.deviceInfo.deviceName] = device_properties;
};
if (_autoSchedule->_pCTPUTLoadContext) {
// lock needed: _devicePriorities may change when a device fails during inference
std::lock_guard<std::mutex> lock(_autoSContext->_fallbackMutex);
auto load_count = _autoSContext->_devicePriorities.size();
for (size_t i = 0; i < load_count; i++)
get_device_supported_metrics(_autoSchedule->_pCTPUTLoadContext[i]);
} else {
{
std::lock_guard<std::mutex> lock(_autoSContext->_fallbackMutex);
if (_autoSchedule->_loadContext[FALLBACKDEVICE].isAlready) {
get_device_supported_metrics(_autoSchedule->_loadContext[FALLBACKDEVICE]);
}
}
std::lock_guard<std::mutex> lock(_autoSContext->_confMutex);
if (_autoSchedule->_loadContext[ACTUALDEVICE].isAlready) {
get_device_supported_metrics(_autoSchedule->_loadContext[ACTUALDEVICE]);
} else {
get_device_supported_metrics(_autoSchedule->_loadContext[CPU]);
}
}
return all_devices;
} else if (name == ov::hint::model_priority) {
@@ -91,6 +110,24 @@ IE::Parameter AutoExecutableNetwork::GetMetric(const std::string& name) const {
const unsigned int defaultNumForTPUT = 4u;
const unsigned int defaultNumForLatency = 1u;
unsigned int real = 0;
if (_autoSchedule->_pCTPUTLoadContext) {
std::lock_guard<std::mutex> lock(_autoSContext->_fallbackMutex);
unsigned int res = 0u;
auto load_count = _autoSContext->_devicePriorities.size();
for (size_t i = 0; i < load_count; i++) {
try {
res += (_autoSchedule->_pCTPUTLoadContext[i]).executableNetwork->GetMetric(
METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)).as<unsigned int>();
} catch (const IE::Exception& iie) {
IE_THROW()
<< "Every device used in cumulative mode should "
<< "support OPTIMAL_NUMBER_OF_INFER_REQUESTS ExecutableNetwork metric. "
<< "Failed to query the metric for with error:" <<
iie.what();
}
}
return decltype(ov::optimal_number_of_infer_requests)::value_type {res};
}
if (_autoSchedule->_loadContext[ACTUALDEVICE].isAlready) {
real = _autoSchedule->_loadContext[ACTUALDEVICE].
executableNetwork->GetMetric(name).as<unsigned int>();
@@ -181,12 +218,13 @@ IE::Parameter AutoExecutableNetwork::GetMetric(const std::string& name) const {
exeDevices.push_back(ExeDevicesString);
execution_devices = decltype(ov::execution_devices)::value_type {exeDevices};
};
if (_autoSContext->_performanceHint == IE::PluginConfigParams::CUMULATIVE_THROUGHPUT) {
try {
execution_devices = _autoSchedule->_loadContext[ACTUALDEVICE].executableNetwork->GetMetric(name);
} catch(const IE::Exception&) {
GetExecutionDevices(_autoSchedule->_loadContext[ACTUALDEVICE].workName);
if (_autoSchedule->_pCTPUTLoadContext) {
std::vector<std::string> exeDevices = {};
std::lock_guard<std::mutex> lock(_autoSContext->_confMutex);
for (auto n : _autoSContext->_devicePriorities) {
exeDevices.push_back(n.deviceName);
}
execution_devices = decltype(ov::execution_devices)::value_type {exeDevices};
} else {
std::lock_guard<std::mutex> lock(_autoSContext->_confMutex);
for (int i = 0; i < CONTEXTNUM; i++) {
@@ -203,9 +241,13 @@ IE::Parameter AutoExecutableNetwork::GetMetric(const std::string& name) const {
return execution_devices;
} else if (name == ov::model_name) {
std::lock_guard<std::mutex> lock(_autoSContext->_confMutex);
if (_autoSchedule->_pCTPUTLoadContext) {
return _autoSchedule->_pCTPUTLoadContext[0].executableNetwork->GetMetric(name);
} else {
if (_autoSchedule->_loadContext[CPU].isEnabled && _autoSchedule->_loadContext[CPU].isAlready)
return _autoSchedule->_loadContext[CPU].executableNetwork->GetMetric(name);
return _autoSchedule->_loadContext[ACTUALDEVICE].executableNetwork->GetMetric(name);
}
} else if (name == METRIC_KEY(SUPPORTED_METRICS)) {
IE_SET_METRIC_RETURN(SUPPORTED_METRICS,
{METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS),

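The GetMetric changes above make ov::optimal_number_of_infer_requests report the sum of each loaded device's optimal request count under CTPUT, so an application can size its request pool with a single query. A caller-side sketch against the 1.x API this file uses (model path and device list are placeholders):

    #include <inference_engine.hpp>

    unsigned int queryOptimalRequests() {
        InferenceEngine::Core ie;
        auto network = ie.ReadNetwork("model.xml");  // placeholder model path
        auto exeNetwork = ie.LoadNetwork(network, "AUTO:GPU,CPU",
            {{CONFIG_KEY(PERFORMANCE_HINT),
              InferenceEngine::PluginConfigParams::CUMULATIVE_THROUGHPUT}});
        // under CTPUT this is the sum over every device the network was loaded on
        return exeNetwork.GetMetric(
            METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)).as<unsigned int>();
    }
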
View File

@@ -3,7 +3,6 @@
//
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "auto_schedule.hpp"
#include "async_infer_request.hpp"
#include "auto_executable_network.hpp"
@@ -183,6 +182,27 @@ bool AutoSchedule::selectOtherDevice(const std::string& currentDeviceName) {
return getExecutionDevices(_loadContext[FALLBACKDEVICE].deviceInfo.deviceName.c_str());
}
};
auto removeInferFailDevice = [&](const std::string& deviceName) {
if (_autoSContext->_devicePriorities.size() > 1) {
const auto CurrentDeviceIter =
std::find_if(_autoSContext->_devicePriorities.begin(),
_autoSContext->_devicePriorities.end(),
[=](const DeviceInformation& d) -> bool {
return d.deviceName.find(deviceName) != std::string::npos;
});
if (CurrentDeviceIter != _autoSContext->_devicePriorities.end()) {
_autoSContext->_devicePriorities.erase(CurrentDeviceIter);
return true;
}
}
return false;
};
if (_pCTPUTLoadContext) {
return removeInferFailDevice(currentDeviceName);
}
return getExecutionDevices(currentDeviceName);
}
}
@@ -217,30 +237,40 @@ void AutoSchedule::init(const ScheduleContext::Ptr& sContext) {
std::list<DeviceInformation> validDevices =
_autoSContext->_plugin->GetValidDevice(_autoSContext->_devicePriorities,
_loadContext[ACTUALDEVICE].networkPrecision);
// When the hint is ctput and there is only one device, the single-device logic is used
if (validDevices.size() == 1) {
// When the hint is ctput and there is only one device, the single-device logic is used instead of
// the MULTI logic
// cannot change _autoSContext->_performanceHint to THROUGHPUT, because GetMetric needs to return CTPUT
_loadContext[ACTUALDEVICE].deviceInfo = validDevices.front();
_loadContext[ACTUALDEVICE].deviceInfo.config[CONFIG_KEY(PERFORMANCE_HINT)] =
IE::PluginConfigParams::THROUGHPUT;
isCumulative = false;
} else {
// When the hint is ctput and there are more than one device, the MULTI logic is used
std::string deviceName = "MULTI:";
} else if (validDevices.size() > 1) {
_loadContext[ACTUALDEVICE].isEnabled = false;
_autoSContext->_devicePriorities.clear();
std::copy(std::begin(validDevices),
std::end(validDevices),
std::back_inserter(_autoSContext->_devicePriorities));
// Total number of devices in CTPUT
auto nCTputDeviceNums = validDevices.size();
// Generate contexts for loading each device
_pCTPUTLoadContext.reset(new AutoLoadContext[nCTputDeviceNums]);
int idx = 0;
DeviceInformation cpuDeviceInformation;
for (auto& device : validDevices) {
deviceName += device.deviceName;
deviceName += ((device.deviceName == validDevices.back().deviceName) ? "" : ",");
if (device.deviceName.find("CPU") == std::string::npos) {
_pCTPUTLoadContext[idx].deviceInfo = device;
_pCTPUTLoadContext[idx].deviceInfo.config[CONFIG_KEY(PERFORMANCE_HINT)] =
IE::PluginConfigParams::THROUGHPUT;
idx++;
} else {
cpuDeviceInformation = device;
cpuDeviceInformation.config.insert(
{ov::affinity.name(), ov::Any(ov::Affinity::CORE).as<std::string>()});
}
}
if (!cpuDeviceInformation.deviceName.empty()) {
_pCTPUTLoadContext[idx].deviceInfo = cpuDeviceInformation;
_pCTPUTLoadContext[idx].deviceInfo.config[CONFIG_KEY(PERFORMANCE_HINT)] =
IE::PluginConfigParams::THROUGHPUT;
}
_loadContext[ACTUALDEVICE].deviceInfo.deviceName = deviceName;
_loadContext[ACTUALDEVICE].deviceInfo.config[CONFIG_KEY(PERFORMANCE_HINT)] =
InferenceEngine::PluginConfigParams::CUMULATIVE_THROUGHPUT;
_loadContext[ACTUALDEVICE].deviceInfo.config[CONFIG_KEY(PERF_COUNT)] =
_autoSContext->_needPerfCounters ? InferenceEngine::PluginConfigParams::YES
: InferenceEngine::PluginConfigParams::NO;
if (_autoSContext->_bindBuffer)
_loadContext[ACTUALDEVICE].deviceInfo.config[ov::intel_auto::device_bind_buffer.name()] =
InferenceEngine::PluginConfigParams::YES;
}
} else {
_loadContext[ACTUALDEVICE].deviceInfo =
@@ -248,15 +278,91 @@ void AutoSchedule::init(const ScheduleContext::Ptr& sContext) {
_loadContext[ACTUALDEVICE].networkPrecision,
_autoSContext->_modelPriority);
}
auto loadDeviceTask = [&](AutoLoadContext* contextPtr,
const std::string& modelPath,
const IE::CNNNetwork& network,
bool isCumulative) {
TryToLoadNetWork(*contextPtr, modelPath, network, isCumulative);
if (contextPtr->isLoadSuccess) {
if (contextPtr->workName.empty()) {
contextPtr->workName = contextPtr->deviceInfo.deviceName;
}
GenerateWorkers(contextPtr->workName, contextPtr->executableNetwork);
// need lock
{
std::lock_guard<std::mutex> lock(_autoSContext->_confMutex);
_autoSContext->_config.insert(contextPtr->deviceInfo.config.begin(),
contextPtr->deviceInfo.config.end());
}
contextPtr->isAlready = true;
// reloadsuccess flag only for _loadContext[FALLBACKDEVICE]
contextPtr->isReloadSuccess = true;
auto& deviceName = contextPtr->deviceInfo.deviceName;
LOG_INFO_TAG("device:%s loading Network finished", deviceName.c_str());
auto supported_config_keys = _autoSContext->_core->GetMetric(deviceName, METRIC_KEY(SUPPORTED_CONFIG_KEYS))
.as<std::vector<std::string>>();
DEBUG_RUN([this, &contextPtr, &deviceName, &supported_config_keys] {
std::lock_guard<std::mutex> lock(_autoSContext->_confMutex);
for (const auto& cfg : supported_config_keys) {
try {
LOG_DEBUG_TAG("device:%s, GetConfig:%s=%s",
deviceName.c_str(),
cfg.c_str(),
contextPtr->executableNetwork->GetConfig(cfg).as<std::string>().c_str());
} catch (const IE::Exception&) {
}
}
});
}
// Handle device load failure in the CTPUT case
if (isCumulative && !contextPtr->isLoadSuccess) {
std::string failedDeviceName = contextPtr->deviceInfo.deviceName;
std::lock_guard<std::mutex> lock(_autoSContext->_confMutex);
const auto DeviceIter =
std::find_if(_autoSContext->_devicePriorities.begin(),
_autoSContext->_devicePriorities.end(),
[&](const DeviceInformation& d) -> bool {
return d.deviceName.find(failedDeviceName) != std::string::npos;
});
// Remove failed device from _devicePriorities
if (DeviceIter != _autoSContext->_devicePriorities.end()) {
_autoSContext->_devicePriorities.erase(DeviceIter);
}
// Remove failed device from ov::device::priorities in config
auto it_prior = _autoSContext->_config.find(ov::device::priorities.name());
if (it_prior != _autoSContext->_config.end()) {
auto priorities = it_prior->second.as<std::string>();
size_t nPos = priorities.find(failedDeviceName);
if (nPos != std::string::npos) {
// if the trailing "," must be removed together with the device name, erase one extra character
size_t nNameLen = (nPos + failedDeviceName.length()) == priorities.length()
? failedDeviceName.length()
: failedDeviceName.length() + 1;
priorities.erase(nPos, nNameLen);
it_prior->second = priorities;
}
}
}
contextPtr->promise.set_value();
// the first load network process finished
std::call_once(_firstLoadOC, [this]() {
_firstLoadPromise.set_value();
});
};
if (_loadContext[ACTUALDEVICE].isEnabled) {
LOG_INFO_TAG("select device:%s", _loadContext[ACTUALDEVICE].deviceInfo.deviceName.c_str());
bool isActualDevCPU =
_loadContext[ACTUALDEVICE].deviceInfo.deviceName.find("CPU") !=std::string::npos && !isCumulative;
// if Actual device is CPU or perf_hint is cumulative, disabled _loadContext[CPU], only use _loadContext[ACTUALDEVICE]
if (isActualDevCPU || isCumulative || !_autoSContext->_startupfallback) {
bool isActualDevCPU = _loadContext[ACTUALDEVICE].deviceInfo.deviceName.find("CPU") != std::string::npos;
// if Actual device is CPU or perf_hint is cumulative, disabled _loadContext[CPU], only use
// _loadContext[ACTUALDEVICE]
if (isActualDevCPU || !_autoSContext->_startupfallback) {
_loadContext[CPU].isEnabled = false;
} else {
const auto CPUIter = std::find_if(_autoSContext->_devicePriorities.begin(), _autoSContext->_devicePriorities.end(),
[=](const DeviceInformation& d) -> bool { return d.deviceName.find("CPU") != std::string::npos; });
const auto CPUIter = std::find_if(_autoSContext->_devicePriorities.begin(),
_autoSContext->_devicePriorities.end(),
[](const DeviceInformation& d) -> bool {
return d.deviceName.find("CPU") != std::string::npos;
});
// if have CPU Device, enable _loadContext[CPU]
if (CPUIter != _autoSContext->_devicePriorities.end()) {
_loadContext[CPU].isEnabled = true;
@@ -275,49 +381,24 @@ void AutoSchedule::init(const ScheduleContext::Ptr& sContext) {
auto* contextPtr = &_loadContext[i];
auto modelPath = _autoSContext->_modelPath;
auto network = _autoSContext->_network;
_loadContext[i].task = [this, contextPtr, modelPath, network, isCumulative]() mutable {
TryToLoadNetWork(*contextPtr, modelPath, network);
if (contextPtr->isLoadSuccess) {
if (contextPtr->workName.empty()) {
contextPtr->workName = contextPtr->deviceInfo.deviceName;
}
if (!isCumulative)
GenerateWorkers(contextPtr->workName, contextPtr->executableNetwork);
//need lock
{
std::lock_guard<std::mutex> lock(_autoSContext->_confMutex);
_autoSContext->_config.insert(contextPtr->deviceInfo.config.begin(), contextPtr->deviceInfo.config.end());
}
contextPtr->isAlready = true;
// reloadsuccess flag only for _loadContext[FALLBACKDEVICE]
contextPtr->isReloadSuccess = true;
auto& deviceName = contextPtr->deviceInfo.deviceName;
LOG_INFO_TAG("device:%s loading Network finished", deviceName.c_str());
if (!isCumulative) {
auto supported_config_keys =
_autoSContext->_core->GetMetric(deviceName, METRIC_KEY(SUPPORTED_CONFIG_KEYS))
.as<std::vector<std::string>>();
DEBUG_RUN([this, &contextPtr, &deviceName, &supported_config_keys] {
std::lock_guard<std::mutex> lock(_autoSContext->_confMutex);
for (const auto& cfg : supported_config_keys) {
try {
LOG_DEBUG_TAG(
"device:%s, GetConfig:%s=%s",
deviceName.c_str(),
cfg.c_str(),
contextPtr->executableNetwork->GetConfig(cfg).as<std::string>().c_str());
} catch (const IE::Exception&) {
_loadContext[i].task = std::bind(loadDeviceTask, contextPtr, modelPath, network, isCumulative);
}
}
});
}
std::vector<Task> otherDevicesloads;
std::vector<Task> cpuLoads;
if (_pCTPUTLoadContext) {
for (size_t i = 0; i < _autoSContext->_devicePriorities.size(); i++) {
auto* contextPtr = &_pCTPUTLoadContext[i];
auto modelPath = _autoSContext->_modelPath;
auto network = _autoSContext->_network;
_pCTPUTLoadContext[i].task = std::bind(loadDeviceTask, contextPtr, modelPath, network, isCumulative);
if (i == _autoSContext->_devicePriorities.size() - 1 &&
_pCTPUTLoadContext[i].deviceInfo.deviceName.find("CPU") != std::string::npos) {
cpuLoads.push_back(_pCTPUTLoadContext[i].task);
} else {
otherDevicesloads.push_back(_pCTPUTLoadContext[i].task);
}
contextPtr->promise.set_value();
// the first load network process finished
std::call_once(_firstLoadOC, [this]() {
_firstLoadPromise.set_value();
});
};
}
}
OV_ITT_SCOPED_TASK(itt::domains::MULTIPlugin,
@@ -350,7 +431,6 @@ void AutoSchedule::init(const ScheduleContext::Ptr& sContext) {
_loadContext[CPU].future.wait();
// clean up helper infer requests
// first, wait for all the remaining requests to finish
if (!_autoSContext->_runtimeFallback) {
for (auto& iter : _workerRequests["CPU_HELP"]) {
try {
iter._inferRequest._ptr->Wait(IE::InferRequest::WaitMode::RESULT_READY);
@@ -358,7 +438,6 @@ void AutoSchedule::init(const ScheduleContext::Ptr& sContext) {
LOG_DEBUG_TAG("No infer results expected, infer in CPU_HELP throw some errors: %s", iie.what());
}
}
}
// late enough to check the idle queue now
// second, check the idle queue if all requests are in place
size_t destroynum = 0;
@@ -410,15 +489,39 @@ void AutoSchedule::init(const ScheduleContext::Ptr& sContext) {
_inferPipelineTasksDeviceSpecific[device.deviceName] = nullptr;
}
_loadContext[ACTUALDEVICE].task();
} else {
if (_pCTPUTLoadContext) {
for (auto&& device : _autoSContext->_devicePriorities) {
// initialize containers before running the async tasks; if left uninitialized, inference will hang
_idleWorkerRequests[device.deviceName];
_workerRequests[device.deviceName];
_inferPipelineTasksDeviceSpecific[device.deviceName] = nullptr;
}
_executor = _autoSContext->_plugin->executorManager()->getIdleCPUStreamsExecutor(IStreamsExecutor::Config{
"CTPUTDeviceAsyncLoad",
static_cast<int>(std::thread::hardware_concurrency()) /* max possible #streams*/,
0 /*default threads per stream, workaround for ticket 62376*/,
IStreamsExecutor::ThreadBindingType::NONE});
// load devices other than CPU first
if (otherDevicesloads.size() > 0) {
// Wait for the devices other than CPU to load the network
_executor->runAndWait(otherDevicesloads);
}
// Finally load the CPU
if (cpuLoads.size() > 0) {
// Wait for CPU to load the network
_executor->runAndWait(cpuLoads);
}
} else {
// only one device needs to load the network, so there is no need to load it asynchronously
_loadContext[ACTUALDEVICE].task();
_passthroughExeNet = _loadContext[ACTUALDEVICE].executableNetwork;
}
}
WaitFirstNetworkReady();
}
void AutoSchedule::TryToLoadNetWork(AutoLoadContext& context, const std::string& modelPath, const IE::CNNNetwork& network) {
void AutoSchedule::TryToLoadNetWork(AutoLoadContext& context, const std::string& modelPath, const IE::CNNNetwork& network, bool isCumulative) {
auto& device = context.deviceInfo.deviceName;
auto& deviceConfig = context.deviceInfo.config;
auto& deviceList = context.metaDevices;
@@ -458,7 +561,7 @@ void AutoSchedule::TryToLoadNetWork(AutoLoadContext& context, const std::string&
context.errMessage += device + ":" + e.what();
context.isLoadSuccess = false;
}
if (context.isLoadSuccess || curDevIsCPU) {
if (context.isLoadSuccess || curDevIsCPU || isCumulative) {
return;
}
// need to reload network, unregister it's priority
@@ -512,7 +615,7 @@ void AutoSchedule::TryToLoadNetWork(AutoLoadContext& context, const std::string&
}
LOG_DEBUG_TAG("try to load %s", context.deviceInfo.deviceName.c_str());
// try to load this candidate device
TryToLoadNetWork(context, modelPath, network);
TryToLoadNetWork(context, modelPath, network, isCumulative);
}
void AutoSchedule::WaitFirstNetworkReady() {
@@ -542,6 +645,20 @@ void AutoSchedule::WaitFirstNetworkReady() {
LOG_ERROR_TAG("load failed, %s", _loadContext[i].errMessage.c_str());
}
}
// devices loaded successfully in CTPUT
if (_pCTPUTLoadContext) {
int nLoadSucNums = 0;
for (size_t i = 0; i < _autoSContext->_devicePriorities.size(); i++) {
// check if device loaded successfully
if (_pCTPUTLoadContext[i].isAlready) {
nLoadSucNums++;
}
}
// one or more devices loaded successfully
if (nLoadSucNums > 0) {
return;
}
}
IE_THROW() << GetLogTag() << "load all devices failed";
}
@@ -560,6 +677,10 @@ bool AutoSchedule::ScheduleToWorkerInferRequest(IE::Task inferPipelineTask, Devi
std::vector<DeviceInformation> devices;
// AUTO work mode
if (!preferred_device.empty()) {
if (_pCTPUTLoadContext) {
std::lock_guard<std::mutex> lock(_autoSContext->_fallbackMutex);
devices = _autoSContext->_devicePriorities;
} else {
// if the device needed by customer is not ready, need to wait for it
WaitActualNetworkReady();
// the preferred_device should be the selected device in AUTO work mode
@@ -567,6 +688,14 @@ bool AutoSchedule::ScheduleToWorkerInferRequest(IE::Task inferPipelineTask, Devi
IE_THROW(NotFound) << "The preferred device should be the selected device";
}
devices.push_back(_loadContext[ACTUALDEVICE].deviceInfo);
}
} else {
if (_pCTPUTLoadContext) {
// Devices that fail infer will be removed from the priority list in the callback, need lock here
std::lock_guard<std::mutex> lock(_autoSContext->_fallbackMutex);
for (size_t i = 0; i < _autoSContext->_devicePriorities.size(); i++) {
devices.push_back(_autoSContext->_devicePriorities[i]);
}
} else {
// _acceleratorDevice could be the same as _cpuDevice, such as AUTO:CPU
if (_loadContext[FALLBACKDEVICE].isAlready) {
@@ -583,6 +712,10 @@ bool AutoSchedule::ScheduleToWorkerInferRequest(IE::Task inferPipelineTask, Devi
}
}
}
}
if (devices.size() == 0) {
IE_THROW(GeneralError) << "No device to run pipeline task";
}
for (auto&& device : devices) {
if (!preferred_device.empty() && (device.deviceName != preferred_device)) {
continue;
@@ -644,27 +777,12 @@ IInferPtr AutoSchedule::CreateInferRequest() {
if (!syncRequestImpl)
syncRequestImpl = CreateInferRequestImpl(execNetwork->_networkInputs, execNetwork->_networkOutputs);
syncRequestImpl->setPointerToExecutableNetworkInternal(execNetwork);
bool isCumulative = (_autoSContext->_performanceHint == IE::PluginConfigParams::CUMULATIVE_THROUGHPUT) ? true : false;
bool isCTPUTSingleDevice =
isCumulative && _loadContext[ACTUALDEVICE].deviceInfo.deviceName.find("MULTI:") == std::string::npos ? true
: false;
if ((_passthroughExeNet && !isCumulative) || isCTPUTSingleDevice) {
std::string perfmode;
try {
perfmode = _passthroughExeNet->GetConfig(
CONFIG_KEY(PERFORMANCE_HINT)).as<std::string>();
} catch (const IE::Exception&) {
LOG_INFO("query perf hint from passthrough network failed");
}
if (_autoSContext->_batchingDisabled || perfmode != CONFIG_VALUE(THROUGHPUT)) {
syncRequestImpl->setPointerToSo(_passthroughExeNet._so);
} else {
if (_passthroughExeNet) {
auto so = _passthroughExeNet._ptr->GetPointerToSo();
// Get the _so from the passthrough executable network when the batch plugin is disabled.
if (!so)
so = _passthroughExeNet._so;
syncRequestImpl->setPointerToSo(so);
}
} else if (std::static_pointer_cast<MultiDeviceInferRequest>(syncRequestImpl)->GetSharedRequest()) {
// cumulative case, load to MULTI:*
auto sharedMultiRequest = std::static_pointer_cast<MultiDeviceInferRequest>(syncRequestImpl)->GetSharedRequest();

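The init() rework above batches the CTPUT loads: every non-CPU device is loaded first and waited on, then CPU is loaded last, plausibly to keep CPU cores free while the other plugins compile. A condensed, generic sketch of that ordering, with std::async standing in for the plugin's streams executor (device names are hypothetical):

    #include <future>
    #include <iostream>
    #include <string>
    #include <vector>

    // Mirror the CTPUT load ordering: launch every non-CPU load in parallel,
    // wait for them all, then run the CPU load last, matching the scheduler above.
    void loadAll(const std::vector<std::string>& devices) {
        std::vector<std::future<void>> others;
        std::string cpu;
        for (const auto& d : devices) {
            if (d.find("CPU") != std::string::npos)
                cpu = d;  // defer the CPU load, as the scheduler above does
            else
                others.emplace_back(std::async(std::launch::async, [d] {
                    std::cout << "loading on " << d << "\n";  // placeholder load
                }));
        }
        for (auto& f : others)
            f.wait();  // wait for the devices other than CPU
        if (!cpu.empty())
            std::cout << "loading on " << cpu << "\n";  // CPU load runs last
    }
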
View File

@@ -50,6 +50,7 @@ public:
public:
AutoLoadContext _loadContext[CONTEXTNUM];
std::unique_ptr<AutoLoadContext[]> _pCTPUTLoadContext = nullptr;
protected:
void GenerateWorkers(const std::string& device, const SoExecNetwork& executableNetwork) override;
@@ -60,7 +61,7 @@ protected:
private:
void WaitFirstNetworkReady();
void TryToLoadNetWork(AutoLoadContext& context, const std::string& modelPath, const IE::CNNNetwork& network);
void TryToLoadNetWork(AutoLoadContext& context, const std::string& modelPath, const IE::CNNNetwork& network, bool isCumulative);
bool selectOtherDevice(const std::string& currentDeviceName);
IE::Task releaseActualdeviceTask;

View File

@@ -307,22 +307,11 @@ IInferPtr MultiSchedule::CreateInferRequest() {
syncRequestImpl = CreateInferRequestImpl(execNetwork->_networkInputs, execNetwork->_networkOutputs);
syncRequestImpl->setPointerToExecutableNetworkInternal(execNetwork);
if (_passthroughExeNet) {
std::string perfmode;
try {
perfmode = _passthroughExeNet->GetConfig(
CONFIG_KEY(PERFORMANCE_HINT)).as<std::string>();
} catch (const IE::Exception&) {
LOG_INFO("query perf hint from passthrough network failed");
}
if (_multiSContext->_batchingDisabled || perfmode != CONFIG_VALUE(THROUGHPUT)) {
syncRequestImpl->setPointerToSo(_passthroughExeNet._so);
} else {
auto so = _passthroughExeNet._ptr->GetPointerToSo();
// Get the _so from the passthrough executable network when the batch plugin is disabled.
if (!so)
so = _passthroughExeNet._so;
syncRequestImpl->setPointerToSo(so);
}
} else if (_multiSContext->_bindBuffer) {
auto sharedRequest = std::static_pointer_cast<MultiDeviceInferRequest>(syncRequestImpl)->GetSharedRequest();
if (sharedRequest._ptr->getPointerToSo())

View File

@@ -6,7 +6,7 @@
#include <string>
#include <vector>
#include <thread>
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "ngraph/function.hpp"
#include "ngraph_functions/subgraph_builders.hpp"
@@ -90,9 +90,11 @@ protected:
} else {
m_extList.push_back(ext);
}
std::replace(test_name.begin(), test_name.end(), '/', '_');
std::replace(test_name.begin(), test_name.end(), '\\', '_');
cache_path = "LoadNetwork" + test_name + "_cache";
auto hash = std::hash<std::string>()(test_name);
std::stringstream ss;
ss << std::this_thread::get_id();
cache_path = "LoadNetwork" + std::to_string(hash) + "_"
+ ss.str() + "_" + GetTimestamp() + "_cache";
}
void TearDown() override {
APIBaseTest::TearDown();

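The fixture change above drops the raw test name, which may contain '/' or '\\' (illegal in Windows paths), in favor of a hash, and appends the thread id and a timestamp so concurrent runs get distinct cache directories. A standalone sketch of the scheme; GetTimestamp() in the fixture is replaced here by a chrono-based stand-in:

    #include <chrono>
    #include <functional>
    #include <sstream>
    #include <string>
    #include <thread>

    // Build a filesystem-safe, collision-resistant cache directory name:
    // hashing the test name keeps '/' and '\' out of the path, and the
    // thread id plus timestamp separate concurrent test runs.
    std::string makeCachePath(const std::string& testName) {
        auto hash = std::hash<std::string>()(testName);
        std::stringstream ss;
        ss << std::this_thread::get_id();
        auto ts = std::chrono::steady_clock::now().time_since_epoch().count();
        return "LoadNetwork" + std::to_string(hash) + "_" + ss.str() + "_" +
               std::to_string(ts) + "_cache";
    }
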
View File

@@ -515,9 +515,11 @@ void CompiledKernelsCacheTest::SetUp() {
} else {
m_extList.push_back(ext);
}
std::replace(test_name.begin(), test_name.end(), '/', '_');
std::replace(test_name.begin(), test_name.end(), '\\', '_');
cache_path = "compiledModel" + test_name + "_cache";
auto hash = std::hash<std::string>()(test_name);
std::stringstream ss;
ss << std::this_thread::get_id();
cache_path = "compiledModel" + std::to_string(hash) + "_"
+ ss.str() + "_" + GetTimestamp() + "_cache";
}
void CompiledKernelsCacheTest::TearDown() {

View File

@@ -200,12 +200,6 @@ TEST_P(LoadNetworkWithCTPUTMockTest, CTPUTSingleDevLogicTest) {
::testing::Matcher<const std::map<std::string, std::string>&>(
ComparePerfHint(InferenceEngine::PluginConfigParams::THROUGHPUT))))
.Times(1);
// no MULTI logic to be called
EXPECT_CALL(*core,
LoadNetwork(::testing::Matcher<const InferenceEngine::CNNNetwork&>(_),
::testing::Matcher<const std::string&>("MULTI:" + targetDevice),
::testing::Matcher<const std::map<std::string, std::string>&>(_)))
.Times(0);
// if target device only has GPU, no CPU helper to be called
if (targetDevice.find("GPU") != std::string::npos) {
EXPECT_CALL(*core,
@@ -220,14 +214,14 @@
for (auto& deviceName : targetDevices) {
targetDev += deviceName;
targetDev += ((deviceName == targetDevices.back()) ? "" : ",");
}
config.insert({InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, targetDev});
// Call MULTI logic
EXPECT_CALL(*core,
LoadNetwork(::testing::Matcher<const InferenceEngine::CNNNetwork&>(_),
::testing::Matcher<const std::string&>("MULTI:" + targetDev),
::testing::Matcher<const std::map<std::string, std::string>&>(_)))
::testing::Matcher<const std::string&>(deviceName),
::testing::Matcher<const std::map<std::string, std::string>&>(
ComparePerfHint(InferenceEngine::PluginConfigParams::THROUGHPUT))))
.Times(1);
}
config.insert({InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, targetDev});
// no CPU helper to be called
EXPECT_CALL(*core,
LoadNetwork(::testing::Matcher<const InferenceEngine::CNNNetwork&>(_),

View File

@@ -230,6 +230,8 @@ public:
}
};
using AutoCTPUTRuntimeFallback = AutoRuntimeFallback;
TEST_P(AutoRuntimeFallback, releaseResource) {
std::string targetDev;
std::vector<std::tuple<std::string, bool>> targetDevices;
@@ -362,3 +364,93 @@ const std::vector<ConfigParams> testConfigs = {
INSTANTIATE_TEST_SUITE_P(smoke_AutoRuntimeFallback, AutoRuntimeFallback,
::testing::ValuesIn(testConfigs),
AutoRuntimeFallback::getTestCaseName);
TEST_P(AutoCTPUTRuntimeFallback, ctputDeviceInferFailTest) {
std::string targetDev;
std::vector<std::tuple<std::string, bool>> targetDevices; // std::tuple<deviceName, whether infer throws an exception>
int loadNetworkNum;
bool enableRumtimeFallback;
bool expectThrow;
bool loadNetworkFail;
bool generateWorkersFail;
std::tie(targetDevices, loadNetworkNum, enableRumtimeFallback, expectThrow, loadNetworkFail, generateWorkersFail) = this->GetParam();
if (loadNetworkFail) {
ON_CALL(*core, LoadNetwork(::testing::Matcher<const InferenceEngine::CNNNetwork&>(_),
::testing::Matcher<const std::string&>(StrEq("GPU.1")),
::testing::Matcher<const Config&>(_))).WillByDefault(Throw(InferenceEngine::GeneralError{""}));
}
for (auto& deviceInfo : targetDevices) {
std::string deviceName;
bool ifThrow;
std::tie(deviceName, ifThrow) = deviceInfo;
targetDev += deviceName;
targetDev += ((deviceInfo == targetDevices.back()) ? "" : ",");
if (deviceName == "CPU") {
mockInferrequest = std::make_shared<mockAsyncInferRequest>(
inferReqInternal, mockExecutor, nullptr, ifThrow);
ON_CALL(*mockIExeNet.get(), CreateInferRequest()).WillByDefault(Return(mockInferrequest));
} else if (deviceName == "GPU.0") {
mockInferrequestGPU_0 = std::make_shared<mockAsyncInferRequest>(
inferReqInternalGPU_0, mockExecutorGPU_0, nullptr, ifThrow);
ON_CALL(*mockIExeNetGPU_0.get(), CreateInferRequest()).WillByDefault(InvokeWithoutArgs([this]() {
std::this_thread::sleep_for(std::chrono::milliseconds(0));
return mockInferrequestGPU_0; }));
} else if (deviceName == "GPU.1") {
if (generateWorkersFail) {
mockInferrequestGPU_1 =
std::make_shared<mockAsyncInferRequest>(inferReqInternalGPU_1, mockExecutorGPU_1, nullptr, ifThrow);
ON_CALL(*mockIExeNetGPU_1.get(), CreateInferRequest())
.WillByDefault(Throw(InferenceEngine::GeneralError{""}));
} else {
mockInferrequestGPU_1 =
std::make_shared<mockAsyncInferRequest>(inferReqInternalGPU_1, mockExecutorGPU_1, nullptr, ifThrow);
ON_CALL(*mockIExeNetGPU_1.get(), CreateInferRequest()).WillByDefault(InvokeWithoutArgs([this]() {
std::this_thread::sleep_for(std::chrono::milliseconds(0));
return mockInferrequestGPU_1;
}));
}
} else {
return;
}
}
plugin->SetName("AUTO");
config.insert({InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, targetDev});
config.insert({InferenceEngine::PluginConfigParams::KEY_PERFORMANCE_HINT,
InferenceEngine::PluginConfigParams::CUMULATIVE_THROUGHPUT});
if (!enableRumtimeFallback) {
config.insert({{"ENABLE_RUNTIME_FALLBACK", "NO"}});
}
EXPECT_CALL(*core,
LoadNetwork(::testing::Matcher<const InferenceEngine::CNNNetwork&>(_),
::testing::Matcher<const std::string&>(_),
::testing::Matcher<const std::map<std::string, std::string>&>(_)))
.Times(loadNetworkNum);
std::shared_ptr<InferenceEngine::IExecutableNetworkInternal> exeNetwork;
std::shared_ptr<IInferRequestInternal> infer_request;
ASSERT_NO_THROW(exeNetwork = plugin->LoadExeNetworkImpl(cnnNet, config));
ASSERT_NO_THROW(infer_request = exeNetwork->CreateInferRequest());
if (expectThrow) {
EXPECT_THROW(infer_request->Infer(), IE::Exception);
} else {
ASSERT_NO_THROW(infer_request->Infer());
}
}
// ConfigParams: targetDevices(deviceName, whether infer throws an exception), loadNetworkNum, enableRumtimeFallback,
// expectThrow, loadNetworkFail, generateWorkersFail
const std::vector<ConfigParams> testCtputConfigs = {
ConfigParams{{{"CPU", false}, {"GPU.0", true}, {"GPU.1", true}}, 3, true, false, false, false},
ConfigParams{{{"CPU", true}, {"GPU.0", false}, {"GPU.1", true}}, 3, true, false, false, false},
ConfigParams{{{"CPU", true}, {"GPU.0", true}, {"GPU.1", true}}, 3, true, true, false, false},
// disable RumtimeFallback
ConfigParams{{{"CPU", false}, {"GPU.0", false}, {"GPU.1", false}}, 3, false, false, false, false},
ConfigParams{{{"CPU", true}, {"GPU.0", false}, {"GPU.1", false}}, 3, false, true, false, false},
};
INSTANTIATE_TEST_SUITE_P(smoke_AutoCTPUTRuntimeFallback,
AutoCTPUTRuntimeFallback,
::testing::ValuesIn(testCtputConfigs),
AutoCTPUTRuntimeFallback::getTestCaseName);