Runtime fallback to other devices (#16015)

* Runtime fallback to other devices

* Update properties.hpp

* Update infer callback in AUTO

* Avoid some hang cases

* Add test cases for AUTO runtime fallback

* Replace mockExecutor with ImmediateExecutor

* Update the runtime fallback logic

* Update test case and support the case that infer failed on CPU_HELP

* Update the test to detect whether to throw exception

* fix the error of CTPUT

* Add lock to AUTO executable network GetContext

* Update variable name in selectOtherDevice API

* Simplify variables and add testcase to improve test coverage

* Fix the issues when release CPU_HELP device and clean up the code

* Clean up code
This commit is contained in:
Wang Wangwang
2023-03-20 10:13:07 +08:00
committed by GitHub
parent b2a2266f60
commit 9c7f7b8338
12 changed files with 610 additions and 43 deletions

View File

@@ -19,9 +19,14 @@ namespace intel_auto {
static constexpr Property<bool> device_bind_buffer{"DEVICE_BIND_BUFFER"};
/**
* @brief auto/multi device setting that enable/disable CPU as acceleration (or helper device) at the beginning
* @brief auto device setting that enable/disable CPU as acceleration (or helper device) at the beginning
*/
static constexpr Property<bool> enable_startup_fallback{"ENABLE_STARTUP_FALLBACK"};
/**
* @brief auto device setting that enable/disable runtime fallback to other devices when infer fails on current
* selected device
*/
static constexpr Property<bool> enable_runtime_fallback{"ENABLE_RUNTIME_FALLBACK"};
} // namespace intel_auto
} // namespace ov

View File

@@ -17,8 +17,13 @@ AutoExecutableNetwork::AutoExecutableNetwork(AutoScheduleContext::Ptr& context,
}
std::shared_ptr<IE::RemoteContext> AutoExecutableNetwork::GetContext() const {
_autoSchedule->WaitActualNetworkReady();
return _autoSchedule->_loadContext[ACTUALDEVICE].executableNetwork->GetContext();
std::lock_guard<std::mutex> lock(_autoSContext->_fallbackMutex);
if (_autoSchedule->_loadContext[FALLBACKDEVICE].isAlready) {
return _autoSchedule->_loadContext[FALLBACKDEVICE].executableNetwork->GetContext();
} else {
_autoSchedule->WaitActualNetworkReady();
return _autoSchedule->_loadContext[ACTUALDEVICE].executableNetwork->GetContext();
}
}
void AutoExecutableNetwork::SetConfig(const std::map<std::string, IE::Parameter>

View File

@@ -79,25 +79,114 @@ void AutoSchedule::GenerateWorkers(const std::string& device,
IdleGuard<NotBusyPriorityWorkerRequests> idleGuard{workerRequestPtr, *idleWorkerRequestsPtr};
workerRequestPtr->_exceptionPtr = exceptionPtr;
{
auto capturedTask = std::move(workerRequestPtr->_task);
capturedTask();
}
// try to return the request to the idle list (fails if the overall object destruction has began)
if (idleGuard.Release()->try_push(std::make_pair(workerRequestPtr->_index, workerRequestPtr))) {
// let's try to pop a task, as we know there is at least one idle request, schedule if succeeded
// if no device-agnostic tasks, let's try pop the device specific task, schedule if succeeded
IE::Task t;
do {
_inferPipelineTasks.try_pop(t);
} while (t && ScheduleToWorkerInferRequest(std::move(t)));
do {
_inferPipelineTasksDeviceSpecific[device]->try_pop(t);
} while (t && ScheduleToWorkerInferRequest(std::move(t), device));
auto stopRetryAndContinue = [workerRequestPtr]() {
auto capturedTask = std::move(workerRequestPtr->_task);
capturedTask();
};
// will fallback to other devices if enable _runtimeFallback
if (workerRequestPtr->_exceptionPtr != nullptr && _autoSContext->_runtimeFallback) {
bool selectOtherDeviceFlag = false;
// select other device
try {
selectOtherDeviceFlag = selectOtherDevice(device);
} catch (const IE::Exception& iie) {
LOG_DEBUG_TAG("select other devices with error: %s", iie.what());
selectOtherDeviceFlag = false;
}
if (selectOtherDeviceFlag) {
// Add end time to current workerRequest and restart the task in pipeline
workerRequestPtr->_endTimes.push_back(std::chrono::steady_clock::now());
workerRequestPtr->_fallbackExec->_task();
} else {
// continue to run the task in pipeline
stopRetryAndContinue();
}
} else {
stopRetryAndContinue();
}
// try to return the request to the idle list (fails if the overall object destruction has began)
if (idleGuard.Release()->try_push(std::make_pair(workerRequestPtr->_index, workerRequestPtr))) {
// let's try to pop a task, as we know there is at least one idle request, schedule if succeeded
// if no device-agnostic tasks, let's try pop the device specific task, schedule if succeeded
IE::Task t;
do {
_inferPipelineTasks.try_pop(t);
} while (t && ScheduleToWorkerInferRequest(std::move(t)));
do {
_inferPipelineTasksDeviceSpecific[device]->try_pop(t);
} while (t && ScheduleToWorkerInferRequest(std::move(t), device));
}
}
});
}
}
// Runtime-fallback device reselection: called when inference failed on
// `currentDeviceName`. Removes the failing device from the priority list and
// tries to (re)load the network on the next best device via
// _loadContext[FALLBACKDEVICE]. Returns true when a usable fallback context is
// ready (or, for CPU_HELP, when the actual device can take over), false when
// no device is left to fall back to. Thread-safe via _fallbackMutex.
bool AutoSchedule::selectOtherDevice(const std::string& currentDeviceName) {
    {
        std::lock_guard<std::mutex> lock(_autoSContext->_fallbackMutex);
        // a recursive function to select other devices
        std::function<bool(std::string)> getExecutionDevices;
        getExecutionDevices = [&](const std::string& deviceName) {
            std::string realDeviceName;
            bool isCPUHelp = false;
            // refresh the precision used for device selection (network mode only)
            if (_autoSContext->_modelPath.empty())
                _loadContext[FALLBACKDEVICE].networkPrecision = GetNetworkPrecision(_autoSContext->_network);
            if (deviceName == "CPU_HELP") {
                // if infer failed in CPU_HELP, we will remove CPU from _devicePriorities
                // and re-run infer request when _loadContext[ACTUALDEVICE] is ready
                realDeviceName = "CPU";
                isCPUHelp = true;
                WaitActualNetworkReady();
            } else {
                realDeviceName = deviceName;
            }
            // locate the failing device in the current priority list
            const auto CurrentDeviceIter = std::find_if(_autoSContext->_devicePriorities.begin(), _autoSContext->_devicePriorities.end(),
                [=](const DeviceInformation& d) -> bool {
                    return d.deviceName.find(realDeviceName) != std::string::npos;});
            if (CurrentDeviceIter != _autoSContext->_devicePriorities.end()) {
                if (_autoSContext->_devicePriorities.size() == 1) {
                    // nothing would remain after erasing — no fallback possible
                    LOG_INFO_TAG("No other devices in _devicePriorities");
                    return false;
                }
                _autoSContext->_devicePriorities.erase(CurrentDeviceIter);
                if (isCPUHelp) {
                    // actual device is ready (waited above); request can be re-run there
                    return true;
                }
            } else {
                // the device was already erased by a previous fallback attempt;
                // report whether that attempt produced a working context
                LOG_DEBUG_TAG("Already selected the fallback device");
                return _loadContext[FALLBACKDEVICE].isReloadSuccess ? true : false;
            }
            // reset the fallback context before loading on the newly selected device
            _loadContext[FALLBACKDEVICE].metaDevices = _autoSContext->_devicePriorities;
            _loadContext[FALLBACKDEVICE].isLoadSuccess = false;
            _loadContext[FALLBACKDEVICE].workName = "";
            _loadContext[FALLBACKDEVICE].isReloadSuccess = false;
            _loadContext[FALLBACKDEVICE].deviceInfo =
                _autoSContext->_plugin->SelectDevice(_autoSContext->_devicePriorities,
                                                     _loadContext[FALLBACKDEVICE].networkPrecision,
                                                     _autoSContext->_modelPriority);
            try {
                // synchronously load the network on the fallback device
                _loadContext[FALLBACKDEVICE].task();
                // FALLBACKDEVICE need to be load again if infer failed, so reset promise here
                _loadContext[FALLBACKDEVICE].promise = {};
                _loadContext[FALLBACKDEVICE].future = _loadContext[FALLBACKDEVICE].promise.get_future();
            } catch (const IE::Exception& iie) {
                LOG_DEBUG_TAG("Load context in FALLBACKDEVICE with error: %s", iie.what());
            }
            if (_loadContext[FALLBACKDEVICE].isReloadSuccess) {
                // fallback context now owns execution; disable the former actual device
                _loadContext[ACTUALDEVICE].isEnabled = false;
                _loadContext[ACTUALDEVICE].isLoadSuccess = false;
                _loadContext[ACTUALDEVICE].isAlready = false;
                LOG_INFO_TAG("Select fallback device:%s", _loadContext[FALLBACKDEVICE].deviceInfo.deviceName.c_str());
                return true;
            } else {
                // load failed or generate works failed, so reselect other devices
                return getExecutionDevices(_loadContext[FALLBACKDEVICE].deviceInfo.deviceName.c_str());
            }
        };
        return getExecutionDevices(currentDeviceName);
    }
}
void AutoSchedule::init(const ScheduleContext::Ptr& sContext) {
_LogTag = sContext->_LogTag;
LOG_INFO_TAG("ExecutableNetwork start");
@@ -116,6 +205,9 @@ void AutoSchedule::init(const ScheduleContext::Ptr& sContext) {
// loadContext[ACTUALDEVICE] is always enabled,
// when there is CPU and there are more than two devices, loadContext[CPU] is enabled
_loadContext[ACTUALDEVICE].isEnabled = true;
if (_autoSContext->_runtimeFallback) {
_loadContext[FALLBACKDEVICE].isEnabled = true;
}
if (_autoSContext->_modelPath.empty())
_loadContext[ACTUALDEVICE].networkPrecision = GetNetworkPrecision(_autoSContext->_network);
_loadContext[ACTUALDEVICE].metaDevices = _autoSContext->_devicePriorities;
@@ -179,7 +271,7 @@ void AutoSchedule::init(const ScheduleContext::Ptr& sContext) {
// initialize the rest members of load context
for (int i = 0; i < CONTEXTNUM; i++) {
if (_loadContext[i].isEnabled) {
_loadContext[i].future = _loadContext[i].promise.get_future();
_loadContext[i].future = _loadContext[i].promise.get_future();
auto* contextPtr = &_loadContext[i];
auto modelPath = _autoSContext->_modelPath;
auto network = _autoSContext->_network;
@@ -197,12 +289,14 @@ void AutoSchedule::init(const ScheduleContext::Ptr& sContext) {
_autoSContext->_config.insert(contextPtr->deviceInfo.config.begin(), contextPtr->deviceInfo.config.end());
}
contextPtr->isAlready = true;
// reloadsuccess flag only for _loadContext[FALLBACKDEVICE]
contextPtr->isReloadSuccess = true;
auto& deviceName = contextPtr->deviceInfo.deviceName;
LOG_INFO_TAG("device:%s loading Network finished", deviceName.c_str());
if (!isCumulative) {
auto supported_config_keys =
_autoSContext->_core->GetMetric(deviceName, METRIC_KEY(SUPPORTED_CONFIG_KEYS))
.as<std::vector<std::string>>();
.as<std::vector<std::string>>();
DEBUG_RUN([this, &contextPtr, &deviceName, &supported_config_keys] {
std::lock_guard<std::mutex> lock(_autoSContext->_confMutex);
for (const auto& cfg : supported_config_keys) {
@@ -256,8 +350,14 @@ void AutoSchedule::init(const ScheduleContext::Ptr& sContext) {
_loadContext[CPU].future.wait();
// clean up helper infer requests
// first, wait for all the remaining requests to finish
for (auto& iter : _workerRequests["CPU_HELP"]) {
iter._inferRequest._ptr->Wait(IE::InferRequest::WaitMode::RESULT_READY);
if (!_autoSContext->_runtimeFallback) {
for (auto& iter : _workerRequests["CPU_HELP"]) {
try {
iter._inferRequest._ptr->Wait(IE::InferRequest::WaitMode::RESULT_READY);
} catch (const IE::Exception& iie) {
LOG_DEBUG_TAG("No infer results expected, infer in CPU_HELP throw some errors: %s", iie.what());
}
}
}
// late enough to check the idle queue now
// second, check the idle queue if all requests are in place
@@ -301,6 +401,15 @@ void AutoSchedule::init(const ScheduleContext::Ptr& sContext) {
}
};
_executor->run(std::move(recycleTask));
} else if (_autoSContext->_devicePriorities.size() != 1 && !isCumulative && _autoSContext->_runtimeFallback) {
// The performance will drop somewhat compared with _passthroughExeNet when ENABLE_RUNTIME_FALLBACK is enabled
for (auto&& device : _autoSContext->_devicePriorities) {
// initialize containers before run async task
_idleWorkerRequests[device.deviceName];
_workerRequests[device.deviceName];
_inferPipelineTasksDeviceSpecific[device.deviceName] = nullptr;
}
_loadContext[ACTUALDEVICE].task();
} else {
// only one device need to load network, do not need to load it async
_loadContext[ACTUALDEVICE].task();
@@ -412,13 +521,13 @@ void AutoSchedule::WaitFirstNetworkReady() {
_firstLoadFuture.wait();
}
// check if there is any device that have loaded network successfully
for (int i = CONTEXTNUM - 1; i >= 0; i--) {
for (int i = CONTEXTNUM - 2; i >= 0; i--) {
if (_loadContext[i].isEnabled && _loadContext[i].isAlready) {
return;
}
}
// the first loading is failed, wait for another loading
for (int i = CONTEXTNUM - 1; i >= 0; i--) {
for (int i = CONTEXTNUM - 2; i >= 0; i--) {
if (_loadContext[i].isEnabled) {
_loadContext[i].future.wait();
// check if loading is successful
@@ -428,7 +537,7 @@ void AutoSchedule::WaitFirstNetworkReady() {
}
}
//print errMessage
for (int i = CONTEXTNUM - 1; i >= 0; i--) {
for (int i = CONTEXTNUM - 2; i >= 0; i--) {
if (_loadContext[i].isEnabled) {
LOG_ERROR_TAG("load failed, %s", _loadContext[i].errMessage.c_str());
}
@@ -460,14 +569,18 @@ bool AutoSchedule::ScheduleToWorkerInferRequest(IE::Task inferPipelineTask, Devi
devices.push_back(_loadContext[ACTUALDEVICE].deviceInfo);
} else {
// _acceleratorDevice could be the same as _cpuDevice, such as AUTO:CPU
if (_loadContext[ACTUALDEVICE].isAlready) {
devices.push_back(_loadContext[ACTUALDEVICE].deviceInfo);
if (_loadContext[FALLBACKDEVICE].isAlready) {
devices.push_back(_loadContext[FALLBACKDEVICE].deviceInfo);
} else {
// replace deviceName with workName, so schedule can select correct
// idleWorkerQueue
auto deviceInfo = _loadContext[CPU].deviceInfo;
deviceInfo.deviceName = _loadContext[CPU].workName;
devices.push_back(std::move(deviceInfo));
if (_loadContext[ACTUALDEVICE].isAlready) {
devices.push_back(_loadContext[ACTUALDEVICE].deviceInfo);
} else {
// replace deviceName with workName, so schedule can select correct
// idleWorkerQueue
auto deviceInfo = _loadContext[CPU].deviceInfo;
deviceInfo.deviceName = _loadContext[CPU].workName;
devices.push_back(std::move(deviceInfo));
}
}
}
for (auto&& device : devices) {

View File

@@ -18,6 +18,7 @@ struct AutoLoadContext {
std::atomic<bool> isEnabled = {false};
std::atomic<bool> isAlready = {false};
std::atomic<bool> isLoadSuccess = {false};
std::atomic<bool> isReloadSuccess = {false};
std::future<void> future;
std::promise<void> promise;
SoExecNetwork executableNetwork;
@@ -36,7 +37,8 @@ struct AutoLoadContext {
enum AutoLoadContextIndex {
CPU = 0,
ACTUALDEVICE = 1,
CONTEXTNUM = 2
FALLBACKDEVICE = 2,
CONTEXTNUM = 3
};
class AutoSchedule : public MultiSchedule {
public:
@@ -52,12 +54,15 @@ public:
protected:
void GenerateWorkers(const std::string& device, const SoExecNetwork& executableNetwork) override;
bool ScheduleToWorkerInferRequest(IE::Task, DeviceName preferred_device = "") override;
static bool RunPipelineTask(IE::Task& inferPipelineTask, NotBusyPriorityWorkerRequests& idleWorkerRequests, const DeviceName& preferred_device);
static bool RunPipelineTask(IE::Task& inferPipelineTask, NotBusyPriorityWorkerRequests& idleWorkerRequests,
const DeviceName& preferred_device);
DeviceMap<NotBusyPriorityWorkerRequests> _idleWorkerRequests;
private:
void WaitFirstNetworkReady();
void TryToLoadNetWork(AutoLoadContext& context, const std::string& modelPath, const IE::CNNNetwork& network);
bool selectOtherDevice(const std::string& currentDeviceName);
IE::Task releaseActualdeviceTask;
private:
IE::IStreamsExecutor::Ptr _executor;

View File

@@ -42,6 +42,26 @@ using Time = std::chrono::time_point<std::chrono::steady_clock>;
template<typename T>
using DeviceMap = std::unordered_map<DeviceName, T>;
// Executor that runs the submitted task synchronously on the caller's thread
// and keeps a copy of it, so the same task can be re-submitted later (the
// runtime-fallback path re-runs it via WorkerInferRequest::_fallbackExec).
struct MultiImmediateExecutor : public IE::ITaskExecutor {
public:
    /**
     * @brief A shared pointer to a MultiImmediateExecutor object
     */
    using Ptr = std::shared_ptr<MultiImmediateExecutor>;
    /**
     * @brief Destroys the object.
     */
    ~MultiImmediateExecutor() override = default;
    /**
     * @brief Stores the task and executes it immediately on the current thread.
     */
    void run(IE::Task task) override {
        _task = std::move(task);
        _task();
    }
    // Last submitted task; retained so it can be re-run on device fallback.
    InferenceEngine::Task _task;
};
struct DeviceInformation {
DeviceName deviceName;
std::map<std::string, std::string> config;
@@ -58,6 +78,7 @@ struct WorkerInferRequest {
std::list<Time> _startTimes;
std::list<Time> _endTimes;
int _index = 0;
MultiImmediateExecutor::Ptr _fallbackExec;
};
using NotBusyPriorityWorkerRequests = IE::ThreadSafeBoundedPriorityQueue<std::pair<int, WorkerInferRequest*>>;
@@ -124,6 +145,7 @@ public:
bool _batchingDisabled = {false};
bool _bindBuffer = false;
bool _startupfallback = true;
bool _runtimeFallback = true;
virtual ~MultiScheduleContext() = default;
};
@@ -137,6 +159,7 @@ public:
unsigned int _modelPriority = 0;
std::string _performanceHint;
std::mutex _confMutex;
std::mutex _fallbackMutex;
MultiDeviceInferencePlugin* _plugin;
virtual ~AutoScheduleContext() = default;
};

View File

@@ -54,10 +54,11 @@ Pipeline MultiSchedule::GetPipeline(const IInferPtr& syncInferRequest, WorkerInf
}
});
} else {
MultiImmediateExecutor::Ptr _firstExecutor = std::make_shared<MultiImmediateExecutor>();
pipeline = {
// if the request is coming with device-specific remote blobs make sure it is scheduled to the specific device only:
Stage {
/*TaskExecutor*/ std::make_shared<IE::ImmediateExecutor>(), /*task*/ [this, &syncInferRequest]() {
/*TaskExecutor*/ _firstExecutor, /*task*/ [this, &syncInferRequest]() {
// by default, no preferred device:
_thisPreferredDeviceName = "";
auto execNetwork = _multiSContext->_executableNetwork.lock();
@@ -96,13 +97,18 @@ Pipeline MultiSchedule::GetPipeline(const IInferPtr& syncInferRequest, WorkerInf
multiSyncInferRequest->SetBlobsToAnotherRequest(_thisWorkerInferRequest->_inferRequest);
INFO_RUN([workerInferRequest]() {
(*workerInferRequest)->_startTimes.push_back(std::chrono::steady_clock::now());
});
});
}},
// final task in the pipeline:
Stage {
/*TaskExecutor*/std::make_shared<ThisRequestExecutor>(workerInferRequest), /*task*/ [this, &syncInferRequest, workerInferRequest]() {
if (nullptr != (*workerInferRequest)->_exceptionPtr) {
std::rethrow_exception((*workerInferRequest)->_exceptionPtr);
/*TaskExecutor*/std::make_shared<ThisRequestExecutor>(workerInferRequest, _firstExecutor), /*task*/
[this, &syncInferRequest, workerInferRequest]() {
INFO_RUN([workerInferRequest]() {
(*workerInferRequest)->_endTimes.push_back(std::chrono::steady_clock::now());
});
std::exception_ptr eptr = (*workerInferRequest)->_exceptionPtr;
if (nullptr != eptr) {
std::rethrow_exception(eptr);
}
if (_multiSContext->_needPerfCounters) {
auto multiSyncInferRequest = std::dynamic_pointer_cast<MultiDeviceInferRequest>
@@ -110,9 +116,6 @@ Pipeline MultiSchedule::GetPipeline(const IInferPtr& syncInferRequest, WorkerInf
multiSyncInferRequest->_scheduledRequest =
(*workerInferRequest)->_inferRequest;
}
INFO_RUN([workerInferRequest]() {
(*workerInferRequest)->_endTimes.push_back(std::chrono::steady_clock::now());
});
}}
};
}

View File

@@ -16,12 +16,14 @@
namespace MultiDevicePlugin {
struct ThisRequestExecutor : public IE::ITaskExecutor {
explicit ThisRequestExecutor(WorkerInferRequest** ptr): _workptrptr{ptr} {}
explicit ThisRequestExecutor(WorkerInferRequest** ptr, MultiImmediateExecutor::Ptr executor = nullptr): _workptrptr{ptr}, _fallbackExec(executor) {}
void run(IE::Task task) override {
(*_workptrptr)->_task = std::move(task);
(*_workptrptr)->_fallbackExec = _fallbackExec;
(*_workptrptr)->_inferRequest->StartAsync();
};
WorkerInferRequest** _workptrptr = nullptr;
MultiImmediateExecutor::Ptr _fallbackExec;
};
class MultiSchedule : public Schedule, public IE::ITaskExecutor {
@@ -54,7 +56,6 @@ protected:
DeviceMap<std::unique_ptr<IE::ThreadSafeQueue<IE::Task>>> _inferPipelineTasksDeviceSpecific;
DeviceMap<NotBusyWorkerRequests> _idleWorkerRequests;
DeviceMap<std::vector<WorkerInferRequest>> _workerRequests;
mutable std::mutex _mutex;
std::atomic_size_t _numRequestsCreated = {0};
MultiScheduleContext::Ptr _multiSContext;
SoExecNetwork _passthroughExeNet;

View File

@@ -485,6 +485,7 @@ IExecutableNetworkInternal::Ptr MultiDeviceInferencePlugin::LoadNetworkImpl(cons
autoSContext->_LogTag = _LogTag;
autoSContext->_bindBuffer = loadConfig.get_property(ov::intel_auto::device_bind_buffer);
autoSContext->_startupfallback = loadConfig.get_property(ov::intel_auto::enable_startup_fallback);
autoSContext->_runtimeFallback = loadConfig.get_property(ov::intel_auto::enable_runtime_fallback);
return std::make_shared<AutoExecutableNetwork>(autoSContext, std::make_shared<AutoSchedule>());
}
OV_ITT_SCOPED_TASK(itt::domains::MULTIPlugin, "MultiDeviceInferencePlugin::LoadNetworkImpl:MultiMode");

View File

@@ -26,6 +26,7 @@ void PluginConfig::set_default() {
std::make_tuple(ov::hint::execution_mode, ov::hint::ExecutionMode::UNDEFINED),
std::make_tuple(ov::hint::num_requests, 0, UnsignedTypeValidator()),
std::make_tuple(ov::intel_auto::enable_startup_fallback, true),
std::make_tuple(ov::intel_auto::enable_runtime_fallback, true),
// TODO 1) cache_dir 2) allow_auto_batch 3) auto_batch_timeout
std::make_tuple(ov::cache_dir, ""),
std::make_tuple(ov::hint::allow_auto_batching, true),

View File

@@ -159,6 +159,9 @@ public:
multi_supported_configKeys.erase(std::remove(
multi_supported_configKeys.begin(), multi_supported_configKeys.end(), ov::intel_auto::enable_startup_fallback.name()),
multi_supported_configKeys.end());
multi_supported_configKeys.erase(std::remove(
multi_supported_configKeys.begin(), multi_supported_configKeys.end(), ov::intel_auto::enable_runtime_fallback.name()),
multi_supported_configKeys.end());
return pluginName == "AUTO" ? supported_configKeys : multi_supported_configKeys;
}
@@ -171,6 +174,9 @@ public:
multi_supported_properties.erase(std::remove(
multi_supported_properties.begin(), multi_supported_properties.end(), ov::intel_auto::enable_startup_fallback),
multi_supported_properties.end());
multi_supported_properties.erase(std::remove(
multi_supported_properties.begin(), multi_supported_properties.end(), ov::intel_auto::enable_runtime_fallback),
multi_supported_properties.end());
return pluginName == "AUTO" ? supported_properties : multi_supported_properties;
}

View File

@@ -0,0 +1,364 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include <ngraph_functions/subgraph_builders.hpp>
#include <common_test_utils/test_constants.hpp>
#include <ie_metric_helpers.hpp>
#include "mock_common.hpp"
#include "unit_test_utils/mocks/cpp_interfaces/interface/mock_icore.hpp"
#include "unit_test_utils/mocks/cpp_interfaces/impl/mock_inference_plugin_internal.hpp"
#include "unit_test_utils/mocks/cpp_interfaces/interface/mock_iexecutable_network_internal.hpp"
#include "unit_test_utils/mocks/cpp_interfaces/interface/mock_iinference_plugin.hpp"
#include "plugin/mock_auto_device_plugin.hpp"
#include "plugin/mock_infer_request.hpp"
using ::testing::Throw;
using ::testing::Matches;
using ::testing::_;
using ::testing::StrEq;
using ::testing::Return;
using ::testing::InvokeWithoutArgs;
using ::testing::NiceMock;
using namespace MockMultiDevice;
using Config = std::map<std::string, std::string>;
using ConfigParams = std::tuple<std::vector<std::tuple<std::string, bool>>, int, bool, bool, bool, bool>;
// Parameterized fixture for AUTO plugin runtime-fallback tests.
// ConfigParams layout: {list of (deviceName, inferShouldThrow)}, expected
// LoadNetwork call count, enableRuntimeFallback, expectThrow, loadNetworkFail,
// generateWorkersFail.
class AutoRuntimeFallback : public ::testing::TestWithParam<ConfigParams> {
public:
    std::shared_ptr<ngraph::Function> function;
    InferenceEngine::CNNNetwork cnnNet;
    std::shared_ptr<NiceMock<MockICore>> core;
    std::shared_ptr<NiceMock<MockMultiDeviceInferencePlugin>> plugin;
    // config for Auto device
    std::map<std::string, std::string> config;
    std::vector<DeviceInformation> metaDevices;
    // mock exeNetwork helpers, one per simulated device
    ov::SoPtr<IExecutableNetworkInternal> mockExeNetwork;
    ov::SoPtr<IExecutableNetworkInternal> mockExeNetworkGPU_0;
    ov::SoPtr<IExecutableNetworkInternal> mockExeNetworkGPU_1;
    ov::SoPtr<IExecutableNetworkInternal> mockExeNetworkVPUX;
    std::shared_ptr<NiceMock<MockIInferRequestInternal>> inferReqInternal;
    std::shared_ptr<NiceMock<MockIInferRequestInternal>> inferReqInternalGPU_0;
    std::shared_ptr<NiceMock<MockIInferRequestInternal>> inferReqInternalGPU_1;
    std::shared_ptr<NiceMock<MockIInferRequestInternal>> inferReqInternalVPUX;
    std::shared_ptr<NiceMock<MockIExecutableNetworkInternal>> mockIExeNet;
    std::shared_ptr<NiceMock<MockIExecutableNetworkInternal>> mockIExeNetGPU_0;
    std::shared_ptr<NiceMock<MockIExecutableNetworkInternal>> mockIExeNetGPU_1;
    std::shared_ptr<NiceMock<MockIExecutableNetworkInternal>> mockIExeNetVPUX;
    std::shared_ptr<mockAsyncInferRequest> mockInferrequest;
    std::shared_ptr<mockAsyncInferRequest> mockInferrequestGPU_0;
    std::shared_ptr<mockAsyncInferRequest> mockInferrequestGPU_1;
    std::shared_ptr<mockAsyncInferRequest> mockInferrequestVPUX;
    std::shared_ptr<ImmediateExecutor> mockExecutor;
    std::shared_ptr<ImmediateExecutor> mockExecutorGPU_0;
    std::shared_ptr<ImmediateExecutor> mockExecutorGPU_1;
    std::shared_ptr<ImmediateExecutor> mockExecutorVPUX;
    size_t optimalNum;

public:
    // Builds a readable test name from the parameter tuple, e.g.
    // "auto_runtime_fallback_GPU.0_true_CPU_false_enableRuntimeFallback".
    static std::string getTestCaseName(testing::TestParamInfo<ConfigParams> obj) {
        std::vector<std::tuple<std::string, bool>> targetDevices;
        int loadNetworkNum;
        bool enableRuntimeFallback;  // fixed spelling of local (was "enableRumtimeFallback")
        bool expectThrow;
        bool loadNetworkFail;
        bool generateWorkersFail;
        std::tie(targetDevices, loadNetworkNum, enableRuntimeFallback, expectThrow, loadNetworkFail, generateWorkersFail) = obj.param;
        std::ostringstream result;
        result << "auto_runtime_fallback_";
        for (auto deviceInfo : targetDevices) {
            std::string deviceName;
            bool ifThrow;
            std::tie(deviceName, ifThrow) = deviceInfo;
            result << deviceName << "_";
            if (ifThrow)
                result << "true_";
            else
                result << "false_";
        }
        if (enableRuntimeFallback)
            result << "enableRuntimeFallback";
        else
            result << "disableRuntimeFallback";
        if (loadNetworkFail)
            result << "loadNetworkFail";
        if (generateWorkersFail)
            result << "generateWorkersFail";
        return result.str();
    }

    // Releases every mock exactly once (the original reset the four
    // mockIExeNet* pointers twice and never released mockExeNetworkVPUX).
    void TearDown() override {
        core.reset();
        plugin.reset();
        mockExeNetwork = {};
        mockExeNetworkGPU_0 = {};
        mockExeNetworkGPU_1 = {};
        mockExeNetworkVPUX = {};
        config.clear();
        metaDevices.clear();
        inferReqInternal.reset();
        inferReqInternalGPU_0.reset();
        inferReqInternalGPU_1.reset();
        inferReqInternalVPUX.reset();
        mockIExeNet.reset();
        mockIExeNetGPU_0.reset();
        mockIExeNetGPU_1.reset();
        mockIExeNetVPUX.reset();
        mockExecutor.reset();
        mockExecutorGPU_0.reset();
        mockExecutorGPU_1.reset();
        mockExecutorVPUX.reset();
    }

    // Prepares mock executable networks / infer requests for CPU, GPU.0,
    // GPU.1 and VPUX, and wires default mock behavior for core and plugin.
    void SetUp() override {
        // prepare mockExeNetwork
        mockIExeNet = std::make_shared<NiceMock<MockIExecutableNetworkInternal>>();
        mockExeNetwork = {mockIExeNet, {}};
        mockIExeNetGPU_0 = std::make_shared<NiceMock<MockIExecutableNetworkInternal>>();
        mockExeNetworkGPU_0 = {mockIExeNetGPU_0, {}};
        mockIExeNetGPU_1 = std::make_shared<NiceMock<MockIExecutableNetworkInternal>>();
        mockExeNetworkGPU_1 = {mockIExeNetGPU_1, {}};
        mockIExeNetVPUX = std::make_shared<NiceMock<MockIExecutableNetworkInternal>>();
        mockExeNetworkVPUX = {mockIExeNetVPUX, {}};
        // prepare mockicore and cnnNetwork for loading
        core = std::make_shared<NiceMock<MockICore>>();
        NiceMock<MockMultiDeviceInferencePlugin>* mock_multi = new NiceMock<MockMultiDeviceInferencePlugin>();
        plugin.reset(mock_multi);
        function = ngraph::builder::subgraph::makeConvPoolRelu();
        cnnNet = InferenceEngine::CNNNetwork(function);
        plugin->SetCore(core);
        IE_SET_METRIC(SUPPORTED_CONFIG_KEYS, supportConfigs, {});
        // NOTE: this stub is superseded by the configKeys ON_CALL below
        // (gmock gives precedence to the most recently registered ON_CALL).
        ON_CALL(*core, GetMetric(_, StrEq(METRIC_KEY(SUPPORTED_CONFIG_KEYS)), _)).WillByDefault(Return(supportConfigs));
        ON_CALL(*core, GetConfig(_, StrEq(ov::compilation_num_threads.name()))).WillByDefault(Return(12));
        std::vector<std::string> availableDevs = {"CPU", "GPU.0", "GPU.1", "VPUX"};
        ON_CALL(*core, GetAvailableDevices()).WillByDefault(Return(availableDevs));
        std::vector<std::string> metrics = {METRIC_KEY(SUPPORTED_CONFIG_KEYS)};
        ON_CALL(*core, GetMetric(_, StrEq(METRIC_KEY(SUPPORTED_METRICS)), _)).WillByDefault(Return(metrics));
        std::vector<std::string> configKeys = {"SUPPORTED_CONFIG_KEYS", "NUM_STREAMS"};
        ON_CALL(*core, GetMetric(_, StrEq(METRIC_KEY(SUPPORTED_CONFIG_KEYS)), _)).WillByDefault(Return(configKeys));
        // GPU/VPUX loads are slowed down so the CPU helper wins the first-load race
        ON_CALL(*core, LoadNetwork(::testing::Matcher<const InferenceEngine::CNNNetwork&>(_),
                    ::testing::Matcher<const std::string&>(StrEq("GPU.0")),
                    ::testing::Matcher<const Config&>(_))).WillByDefault(InvokeWithoutArgs([this]() {
                        std::this_thread::sleep_for(std::chrono::milliseconds(200));
                        return mockExeNetworkGPU_0; }));
        ON_CALL(*core, LoadNetwork(::testing::Matcher<const InferenceEngine::CNNNetwork&>(_),
                    ::testing::Matcher<const std::string&>(StrEq("GPU.1")),
                    ::testing::Matcher<const Config&>(_))).WillByDefault(InvokeWithoutArgs([this]() {
                        std::this_thread::sleep_for(std::chrono::milliseconds(200));
                        return mockExeNetworkGPU_1; }));
        ON_CALL(*core, LoadNetwork(::testing::Matcher<const InferenceEngine::CNNNetwork&>(_),
                    ::testing::Matcher<const std::string&>(StrEq(CommonTestUtils::DEVICE_KEEMBAY)),
                    ::testing::Matcher<const Config&>(_))).WillByDefault(InvokeWithoutArgs([this]() {
                        std::this_thread::sleep_for(std::chrono::milliseconds(200));
                        return mockExeNetworkVPUX; }));
        ON_CALL(*core, LoadNetwork(::testing::Matcher<const InferenceEngine::CNNNetwork&>(_),
                    ::testing::Matcher<const std::string&>(StrEq(CommonTestUtils::DEVICE_CPU)),
                    ::testing::Matcher<const Config&>(_))).WillByDefault(Return(mockExeNetwork));
        // delegate device parsing/selection to the real plugin implementations
        ON_CALL(*plugin, ParseMetaDevices)
            .WillByDefault(
                [this](const std::string& priorityDevices, const std::map<std::string, std::string>& config) {
                    return plugin->MultiDeviceInferencePlugin::ParseMetaDevices(priorityDevices, config);
                });
        // single registration (the original registered an identical
        // ON_CALL(*plugin, SelectDevice) stub twice)
        ON_CALL(*plugin, SelectDevice)
            .WillByDefault([this](const std::vector<DeviceInformation>& metaDevices,
                                  const std::string& netPrecision,
                                  unsigned int priority) {
                return plugin->MultiDeviceInferencePlugin::SelectDevice(metaDevices, netPrecision, priority);
            });
        ON_CALL(*plugin, GetValidDevice)
            .WillByDefault([this](const std::vector<DeviceInformation>& metaDevices, const std::string& netPrecision) {
                std::list<DeviceInformation> devices(metaDevices.begin(), metaDevices.end());
                return devices;
            });
        ON_CALL(*plugin, GetDeviceList).WillByDefault([this](const std::map<std::string, std::string>& config) {
            return plugin->MultiDeviceInferencePlugin::GetDeviceList(config);
        });
        // every mock network reports one optimal infer request
        inferReqInternal = std::make_shared<NiceMock<MockIInferRequestInternal>>();
        mockExecutor = std::make_shared<ImmediateExecutor>();
        IE_SET_METRIC(OPTIMAL_NUMBER_OF_INFER_REQUESTS, optimalNum, 1);
        ON_CALL(*mockIExeNet.get(), GetMetric(StrEq(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS))))
            .WillByDefault(Return(optimalNum));
        inferReqInternalGPU_0 = std::make_shared<NiceMock<MockIInferRequestInternal>>();
        mockExecutorGPU_0 = std::make_shared<ImmediateExecutor>();
        ON_CALL(*mockIExeNetGPU_0.get(), GetMetric(StrEq(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS))))
            .WillByDefault(Return(optimalNum));
        inferReqInternalGPU_1 = std::make_shared<NiceMock<MockIInferRequestInternal>>();
        mockExecutorGPU_1 = std::make_shared<ImmediateExecutor>();
        ON_CALL(*mockIExeNetGPU_1.get(), GetMetric(StrEq(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS))))
            .WillByDefault(Return(optimalNum));
        inferReqInternalVPUX = std::make_shared<NiceMock<MockIInferRequestInternal>>();
        mockExecutorVPUX = std::make_shared<ImmediateExecutor>();
        ON_CALL(*mockIExeNetVPUX.get(), GetMetric(StrEq(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS))))
            .WillByDefault(Return(optimalNum));
    }
};
TEST_P(AutoRuntimeFallback, releaseResource) {
    // Test parameters (see ConfigParams):
    //   targetDevices         - list of (deviceName, ifThrow); ifThrow makes that device's
    //                           mocked infer request throw during inference
    //   loadNetworkNum        - expected total number of LoadNetwork calls (initial loads
    //                           plus any runtime-fallback reloads)
    //   enableRuntimeFallback - when false, ENABLE_RUNTIME_FALLBACK is explicitly set to "NO"
    //   expectThrow           - whether Infer() is expected to propagate an exception
    //   loadNetworkFail       - force LoadNetwork to fail for "GPU.1"
    //   generateWorkersFail   - force CreateInferRequest to fail for "GPU.1"
    std::vector<std::tuple<std::string, bool>> targetDevices;
    int loadNetworkNum;
    bool enableRuntimeFallback;
    bool expectThrow;
    bool loadNetworkFail;
    bool generateWorkersFail;
    std::tie(targetDevices, loadNetworkNum, enableRuntimeFallback, expectThrow, loadNetworkFail, generateWorkersFail) =
        this->GetParam();
    if (loadNetworkFail) {
        ON_CALL(*core, LoadNetwork(::testing::Matcher<const InferenceEngine::CNNNetwork&>(_),
                    ::testing::Matcher<const std::string&>(StrEq("GPU.1")),
                    ::testing::Matcher<const Config&>(_))).WillByDefault(Throw(InferenceEngine::GeneralError{""}));
    }
    // Build the comma-separated device priority string and install a mock infer
    // request for each device. Index-based separator insertion stays correct even
    // if a device name appears more than once in the list (comparing against
    // targetDevices.back() by value would not).
    std::string targetDev;
    for (size_t i = 0; i < targetDevices.size(); ++i) {
        const std::string& deviceName = std::get<0>(targetDevices[i]);
        const bool ifThrow = std::get<1>(targetDevices[i]);
        if (i > 0)
            targetDev += ",";
        targetDev += deviceName;
        if (deviceName == "CPU") {
            mockInferrequest = std::make_shared<mockAsyncInferRequest>(
                inferReqInternal, mockExecutor, nullptr, ifThrow);
            ON_CALL(*mockIExeNet.get(), CreateInferRequest()).WillByDefault(Return(mockInferrequest));
        } else if (deviceName == "GPU.0") {
            mockInferrequestGPU_0 = std::make_shared<mockAsyncInferRequest>(
                inferReqInternalGPU_0, mockExecutorGPU_0, nullptr, ifThrow);
            ON_CALL(*mockIExeNetGPU_0.get(), CreateInferRequest()).WillByDefault(InvokeWithoutArgs([this]() {
                std::this_thread::sleep_for(std::chrono::milliseconds(0));
                return mockInferrequestGPU_0; }));
        } else if (deviceName == "GPU.1") {
            if (generateWorkersFail) {
                mockInferrequestGPU_1 = std::make_shared<mockAsyncInferRequest>(
                    inferReqInternalGPU_1, mockExecutorGPU_1, nullptr, ifThrow);
                ON_CALL(*mockIExeNetGPU_1.get(), CreateInferRequest()).WillByDefault(Throw(InferenceEngine::GeneralError{""}));
            } else {
                mockInferrequestGPU_1 = std::make_shared<mockAsyncInferRequest>(
                    inferReqInternalGPU_1, mockExecutorGPU_1, nullptr, ifThrow);
                ON_CALL(*mockIExeNetGPU_1.get(), CreateInferRequest()).WillByDefault(InvokeWithoutArgs([this]() {
                    std::this_thread::sleep_for(std::chrono::milliseconds(0));
                    return mockInferrequestGPU_1; }));
            }
        } else if (deviceName == "VPUX") {
            mockInferrequestVPUX = std::make_shared<mockAsyncInferRequest>(
                inferReqInternalVPUX, mockExecutorVPUX, nullptr, ifThrow);
            ON_CALL(*mockIExeNetVPUX.get(), CreateInferRequest()).WillByDefault(InvokeWithoutArgs([this]() {
                std::this_thread::sleep_for(std::chrono::milliseconds(0));
                return mockInferrequestVPUX; }));
        } else {
            // Previously an unknown device name returned early and the test passed
            // silently; fail loudly instead so bad ConfigParams are caught.
            FAIL() << "Unsupported device in test config: " << deviceName;
        }
    }
    plugin->SetName("AUTO");
    config.insert({InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, targetDev});
    if (!enableRuntimeFallback) {
        config.insert({{"ENABLE_RUNTIME_FALLBACK", "NO"}});
    }
    EXPECT_CALL(*core,
                LoadNetwork(::testing::Matcher<const InferenceEngine::CNNNetwork&>(_),
                            ::testing::Matcher<const std::string&>(_),
                            ::testing::Matcher<const std::map<std::string, std::string>&>(_)))
        .Times(loadNetworkNum);
    std::shared_ptr<InferenceEngine::IExecutableNetworkInternal> exeNetwork;
    std::shared_ptr<IInferRequestInternal> infer_request;
    ASSERT_NO_THROW(exeNetwork = plugin->LoadExeNetworkImpl(cnnNet, config));
    ASSERT_NO_THROW(infer_request = exeNetwork->CreateInferRequest());
    if (expectThrow) {
        EXPECT_THROW(infer_request->Infer(), IE::Exception);
    } else {
        ASSERT_NO_THROW(infer_request->Infer());
    }
}
// Each ConfigParams entry is:
//   { targetDevices, loadNetworkNum, enableRuntimeFallback, expectThrow, loadNetworkFail, generateWorkersFail }
// where targetDevices is a list of (deviceName, inferThrows) pairs and
// loadNetworkNum is the expected total number of LoadNetwork calls
// (initial loads plus runtime-fallback reloads).
const std::vector<ConfigParams> testConfigs = {
    ConfigParams{{{"GPU.0", true}, {"GPU.1", true}}, 2, true, true, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", false}}, 2, true, false, false, false},
    ConfigParams{{{"GPU.0", false}, {"GPU.1", true}}, 1, true, false, false, false},
    ConfigParams{{{"GPU.0", false}, {"GPU.1", false}}, 1, true, false, false, false},
    //CPU_HELP does not throw
    ConfigParams{{{"GPU.0", false}, {"CPU", false}}, 2, true, false, false, false},
    ConfigParams{{{"GPU.0", true}, {"CPU", false}}, 2, true, false, false, false},
    //CPU_HELP throw
    ConfigParams{{{"GPU.0", false}, {"CPU", true}}, 2, true, false, false, false},
    ConfigParams{{{"GPU.0", true}, {"CPU", true}}, 2, true, true, false, false},
    // 3 devices
    ConfigParams{{{"GPU.0", false}, {"GPU.1", false}, {"VPUX", false}}, 1, true, false, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", false}, {"VPUX", false}}, 2, true, false, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", true}, {"VPUX", false}}, 3, true, false, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", true}, {"VPUX", true}}, 3, true, true, false, false},
    //CPU_HELP does not throw
    ConfigParams{{{"GPU.0", false}, {"GPU.1", false}, {"CPU", false}}, 2, true, false, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", false}, {"CPU", false}}, 2, true, false, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", true}, {"CPU", false}}, 2, true, false, false, false},
    //CPU_HELP throw
    ConfigParams{{{"GPU.0", false}, {"GPU.1", false}, {"CPU", true}}, 2, true, false, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", false}, {"CPU", true}}, 3, true, false, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", true}, {"CPU", true}}, 3, true, true, false, false},
    // disable RuntimeFallback
    ConfigParams{{{"GPU.0", true}, {"GPU.1", true}}, 1, false, true, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", false}}, 1, false, true, false, false},
    ConfigParams{{{"GPU.0", false}, {"GPU.1", true}}, 1, false, false, false, false},
    ConfigParams{{{"GPU.0", false}, {"GPU.1", false}}, 1, false, false, false, false},
    //CPU_HELP does not throw
    ConfigParams{{{"GPU.0", false}, {"CPU", false}}, 2, false, false, false, false},
    ConfigParams{{{"GPU.0", true}, {"CPU", false}}, 2, false, false, false, false},
    //CPU_HELP throw
    ConfigParams{{{"GPU.0", false}, {"CPU", true}}, 2, false, true, false, false},
    ConfigParams{{{"GPU.0", true}, {"CPU", true}}, 2, false, true, false, false},
    // 3 devices
    ConfigParams{{{"GPU.0", false}, {"GPU.1", false}, {"VPUX", false}}, 1, false, false, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", false}, {"VPUX", false}}, 1, false, true, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", true}, {"VPUX", false}}, 1, false, true, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", true}, {"VPUX", true}}, 1, false, true, false, false},
    //CPU_HELP does not throw
    ConfigParams{{{"GPU.0", false}, {"GPU.1", false}, {"CPU", false}}, 2, false, false, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", false}, {"CPU", false}}, 2, false, false, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", true}, {"CPU", false}}, 2, false, false, false, false},
    //CPU_HELP throw
    ConfigParams{{{"GPU.0", false}, {"GPU.1", false}, {"CPU", true}}, 2, false, true, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", false}, {"CPU", true}}, 2, false, true, false, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", true}, {"CPU", true}}, 2, false, true, false, false},
    // loadFail and CreateInferRequestFail
    ConfigParams{{{"GPU.0", true}, {"GPU.1", false}, {"VPUX", false}}, 3, true, false, true, false},
    ConfigParams{{{"GPU.0", true}, {"GPU.1", false}, {"VPUX", false}}, 3, true, false, false, true},
};
// Instantiate the releaseResource test over every runtime-fallback configuration.
INSTANTIATE_TEST_SUITE_P(smoke_AutoRuntimeFallback, AutoRuntimeFallback,
                ::testing::ValuesIn(testConfigs),
           AutoRuntimeFallback::getTestCaseName);

View File

@@ -0,0 +1,40 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <gmock/gmock.h>
#include "ie_icore.hpp"
#include "plugin.hpp"
#include <iostream>
using namespace MockMultiDevicePlugin;
namespace MockMultiDevice {
// Async infer request mock whose single pipeline stage can be configured to
// throw, simulating an inference failure on a particular device so the AUTO
// plugin's runtime-fallback path can be exercised.
class mockAsyncInferRequest : public InferenceEngine::AsyncInferRequestThreadSafeDefault {
public:
    using Parent = InferenceEngine::AsyncInferRequestThreadSafeDefault;
    // inferRequest     - synchronous request implementation to wrap
    // taskExecutor     - executor that runs the pipeline stage
    // callbackExecutor - executor forwarded to the base class for callbacks
    // ifThrow          - when true, the pipeline stage throws on execution
    mockAsyncInferRequest(const InferenceEngine::IInferRequestInternal::Ptr &inferRequest,
                          const ImmediateExecutor::Ptr& taskExecutor,
                          const ImmediateExecutor::Ptr& callbackExecutor,
                          bool ifThrow);
    ~mockAsyncInferRequest() override = default;
private:
    bool _throw;  // whether the pipeline stage throws
};
// Wraps the given synchronous request and replaces the default pipeline with a
// single stage that either succeeds immediately or throws, per ifThrow.
// Marked inline: this out-of-line definition lives in a header (#pragma once),
// so without inline it would violate the ODR when included by multiple TUs.
inline mockAsyncInferRequest::mockAsyncInferRequest(const InferenceEngine::IInferRequestInternal::Ptr &inferRequest,
    const ImmediateExecutor::Ptr& taskExecutor,
    const ImmediateExecutor::Ptr& callbackExecutor,
    bool ifThrow)
    : InferenceEngine::AsyncInferRequestThreadSafeDefault(inferRequest, taskExecutor, callbackExecutor), _throw(ifThrow) {
    // Discard the base class's default pipeline and install the single mock stage.
    _pipeline = {};
    _pipeline.push_back({taskExecutor,
                [this] {
                    if (_throw)
                        IE_THROW();
                } });
}
} // namespace MockMultiDevice