From d0eef043fd978e710cf0f468c2362d0d329fb682 Mon Sep 17 00:00:00 2001
From: Maxim Shevtsov
Date: Wed, 9 Dec 2020 09:52:19 +0300
Subject: [PATCH] [MULTI] Data affinity remote context and blobs (#3342)

* zero-copy (assuming deterministic app-level scheduling) for the multi-device, via "borrowing" the corresponding device-specific blobs and letting the app implicitly use these
* Optimized infer-request scheduling
* remote-blob checks in the conventional SetBlob
* correctly (with status) reporting NOT_IMPLEMENTED
* SetBlob to accommodate the RemoteBlobs
* Tests for remote-blob support via MULTI: creating the shared_test in case the other (closed-source) plugins would want to use it (in the private shared_tests instantiations). Also instantiating the remote-blob tests for some basic combinations, to test that MULTI supports them
* macOS compilation (and general plugin platform support) fix
* shuffled files, so that the MULTI tests are now part of ieFuncTests (and need no separate target). Also brushed the macro that handles NOT_IMPLEMENTED a bit
* further shuffled files, so that the initial MULTI tests are now part of the IE tests, yet specific instances do need separate targets
* Fixed a misprint
* Brushed the code and comments a bit
* further brushing of ScheduleToWorkerRequest: moving the task execution directly into the loop over devices (avoids pointers and an 'else' clause)
* 1) zero-copy (assuming deterministic app-level scheduling) for the multi-device, via "borrowing" the corresponding device-specific blobs and letting the app implicitly use these. 2) Initial MULTI section in the opt guide (primarily to document a tip on helping MULTI keep the zero-copy path)
* [MULTI] remote context support and associated scheduling (respecting the remote data affinity)
* fix for the CentOS (old) gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81880: since the introduced thread_local string is a template, the bug manifests itself (and the string is not allocated/initialized). The workaround is to wrap the std::string into a function
* further fix for the old-gcc-versions issue, now with a non-trivial thread_local destruction segfault: switching from the std::string to a plain const char*
* additional tests for the MULTI and remote blobs (the no-remote-context and multi-GPU cases)
* fix for the tests (which can now check for the more specific NotImplemented exception). Also a couple of line-ending fixes
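For context on the zero-copy path described above: from the application side it amounts to creating a remote blob on the device context that MULTI now exposes, and setting it on the request. The sketch below is illustrative only (it mirrors the new canCreateContextThenRequestThenBlobsAndInfer test added by this patch); "model.xml" and the "MULTI:GPU,CPU" device string are placeholder assumptions, not part of the patch:

    #include <ie_core.hpp>
    #include <ie_remote_context.hpp>

    int main() {
        InferenceEngine::Core ie;
        auto net = ie.ReadNetwork("model.xml");               // placeholder model path
        auto execNet = ie.LoadNetwork(net, "MULTI:GPU,CPU");
        // with this patch, MULTI forwards GetContext() of the first device that has one
        auto ctx = execNet.GetContext();
        auto req = execNet.CreateInferRequest();
        auto inputName = execNet.GetInputsInfo().begin()->first;
        auto desc = execNet.GetInputsInfo().begin()->second->getTensorDesc();
        // device-local (remote) blob: no host round-trip when the request runs on that device
        auto remoteBlob = InferenceEngine::make_shared_blob(desc, ctx);
        remoteBlob->allocate();
        req.SetBlob(inputName, remoteBlob);  // the new affinity logic pins the request to the blob's device
        req.StartAsync();
        req.Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
        return 0;
    }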
---
 .../include/cpp/ie_infer_request.hpp          |  4 +-
 .../multi_device_async_infer_request.cpp      | 61 +++++++++++++-----
 .../multi_device_exec_network.cpp             | 51 +++++++++++----
 .../multi_device_exec_network.hpp             | 11 +++-
 .../impl/ie_executable_network_internal.hpp   | 12 ++--
 .../impl/ie_infer_request_internal.hpp        |  6 +-
 .../tests/functional/plugin/CMakeLists.txt    |  3 +-
 .../multi/cpu_remote_blob_tests.cpp           | 15 +++++
 .../multi/gpu_remote_blob_tests.cpp           | 57 +++++++++++++++++
 .../myriad_remote_blobs_tests.cpp             | 18 ++++++
 .../include/behavior/core_integration.hpp     |  7 +-
 .../include/behavior/exec_graph_info.hpp      | 18 +++---
 .../include/multi/multi_remote_blob_tests.hpp | 36 +++++++++++
 .../ie_test_utils/multi/multi_helpers.hpp     | 64 +++++++++++++++++++
 14 files changed, 312 insertions(+), 51 deletions(-)
 create mode 100644 inference-engine/tests/functional/plugin/cpu/shared_tests_instances/multi/cpu_remote_blob_tests.cpp
 create mode 100644 inference-engine/tests/functional/plugin/gpu/shared_tests_instances/multi/gpu_remote_blob_tests.cpp
 create mode 100644 inference-engine/tests/functional/plugin/myriad/shared_tests_instances/myriad_remote_blobs_tests.cpp
 create mode 100644 inference-engine/tests/functional/plugin/shared/include/multi/multi_remote_blob_tests.hpp
 create mode 100644 inference-engine/tests/ie_test_utils/multi/multi_helpers.hpp

diff --git a/inference-engine/include/cpp/ie_infer_request.hpp b/inference-engine/include/cpp/ie_infer_request.hpp
index 8cae1255188..55085e2c106 100644
--- a/inference-engine/include/cpp/ie_infer_request.hpp
+++ b/inference-engine/include/cpp/ie_infer_request.hpp
@@ -14,6 +14,7 @@
 #include
 #include "cpp/ie_memory_state.hpp"
+#include "ie_remote_context.hpp"
 #include "ie_iinfer_request.hpp"
 #include "details/ie_exception_conversion.hpp"
 #include "details/ie_so_loader.h"
@@ -123,8 +124,9 @@ public:
         CALL_STATUS_FNC(GetBlob, name.c_str(), data);
         std::string error = "Internal error: blob with name `" + name + "` is not allocated!";
         auto blobPtr = data.get();
+        const bool remoteBlobPassed = blobPtr->is<RemoteBlob>();
         if (blobPtr == nullptr) THROW_IE_EXCEPTION << error;
-        if (blobPtr->buffer() == nullptr) THROW_IE_EXCEPTION << error;
+        if (!remoteBlobPassed && blobPtr->buffer() == nullptr) THROW_IE_EXCEPTION << error;
         return data;
     }
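The GetBlob change above is needed because a RemoteBlob lives in device memory and does not expose a host-side buffer() mapping, so the old nullptr check would falsely reject valid remote blobs. A hypothetical stand-alone helper (not from the patch) that captures the patched rule:

    #include <ie_blob.h>
    #include <ie_remote_context.hpp>
    #include <details/ie_exception.hpp>

    // reject a blob only when it is a host blob without an allocated buffer;
    // remote blobs are legitimately buffer-less on the host side
    void checkBlobAllocated(const InferenceEngine::Blob::Ptr& blob, const std::string& name) {
        if (blob == nullptr)
            THROW_IE_EXCEPTION << "blob with name `" << name << "` is not allocated!";
        const bool remoteBlobPassed = blob->is<InferenceEngine::RemoteBlob>();
        if (!remoteBlobPassed && blob->buffer() == nullptr)
            THROW_IE_EXCEPTION << "blob with name `" << name << "` is not allocated!";
    }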
diff --git a/inference-engine/src/multi_device/multi_device_async_infer_request.cpp b/inference-engine/src/multi_device/multi_device_async_infer_request.cpp
index 0761e2ae595..9c578e32666 100644
--- a/inference-engine/src/multi_device/multi_device_async_infer_request.cpp
+++ b/inference-engine/src/multi_device/multi_device_async_infer_request.cpp
@@ -22,6 +22,7 @@ MultiDeviceAsyncInferRequest::MultiDeviceAsyncInferRequest(
     _multiDeviceExecutableNetwork{multiDeviceExecutableNetwork},
     _inferRequest{inferRequest},
     _needPerfCounters{needPerfCounters} {
+    // this executor starts the inference while the task (checking the result) is passed to the next stage
     struct ThisRequestExecutor : public ITaskExecutor {
         explicit ThisRequestExecutor(MultiDeviceAsyncInferRequest* _this_) : _this{_this_} {}
         void run(Task task) override {
@@ -32,22 +33,52 @@ MultiDeviceAsyncInferRequest::MultiDeviceAsyncInferRequest(
         MultiDeviceAsyncInferRequest* _this = nullptr;
     };
     _pipeline = {
-        {_multiDeviceExecutableNetwork, [this] {
-            _workerInferRequest = MultiDeviceExecutableNetwork::_thisWorkerInferRequest;
-            _inferRequest->SetBlobsToAnotherRequest(_workerInferRequest->_inferRequest);
+        // if the request is coming with device-specific remote blobs, make sure it is scheduled to that specific device only:
+        { /*TaskExecutor*/ std::make_shared(), /*task*/ [this] {
+            // by default, no preferred device:
+            _multiDeviceExecutableNetwork->_thisPreferredDeviceName = "";
+            // if any input is remote (e.g. was set with SetBlob), let's use the corresponding device
+            for (const auto &it : _multiDeviceExecutableNetwork->GetInputsInfo()) {
+                Blob::Ptr b;
+                _inferRequest->GetBlob(it.first.c_str(), b);
+                auto r = b->as<RemoteBlob>();
+                if (r) {
+                    const auto name = r->getDeviceName();
+                    const auto res = std::find_if(
+                        _multiDeviceExecutableNetwork->_devicePrioritiesInitial.cbegin(),
+                        _multiDeviceExecutableNetwork->_devicePrioritiesInitial.cend(),
+                        [&name](const MultiDevicePlugin::DeviceInformation& d){ return d.deviceName == name; });
+                    if (_multiDeviceExecutableNetwork->_devicePrioritiesInitial.cend() == res) {
+                        THROW_IE_EXCEPTION << "None of the devices (for which current MULTI-device configuration was "
+                                              "initialized) supports a remote blob created on the device named " << name;
+                    } else {
+                        // it is OK to take the c_str() here (as noted in multi_device_exec_network.hpp, we need to use a const char*),
+                        // as the original strings are from the "persistent" vector (with the right lifetime)
+                        _multiDeviceExecutableNetwork->_thisPreferredDeviceName = res->deviceName.c_str();
+                        break;
+                    }
+                }
+            }
         }},
-        {std::make_shared<ThisRequestExecutor>(this), [this] {
-            auto status = _workerInferRequest->_status;
-            if (InferenceEngine::StatusCode::OK != status) {
-                if (nullptr != InferenceEngine::CurrentException()) {
-                    std::rethrow_exception(InferenceEngine::CurrentException());
-                } else {
-                    THROW_IE_EXCEPTION << InferenceEngine::details::as_status << status;
-                }
-            }
-            if (_needPerfCounters) {
-                _perfMap = _workerInferRequest->_inferRequest.GetPerformanceCounts();
-            }
+        // as the scheduling algo may select any device, this stage accepts the scheduling decision (the actual workerRequest)
+        // and then sets the device-agnostic blobs to the actual (device-specific) request
+        { /*TaskExecutor*/ _multiDeviceExecutableNetwork, /*task*/ [this] {
+            _workerInferRequest = MultiDeviceExecutableNetwork::_thisWorkerInferRequest;
+            _inferRequest->SetBlobsToAnotherRequest(_workerInferRequest->_inferRequest);
+        }},
+        // final task in the pipeline:
+        { /*TaskExecutor*/ std::make_shared<ThisRequestExecutor>(this), /*task*/ [this] {
+            auto status = _workerInferRequest->_status;
+            if (InferenceEngine::StatusCode::OK != status) {
+                if (nullptr != InferenceEngine::CurrentException())
+                    std::rethrow_exception(InferenceEngine::CurrentException());
+                else
+                    THROW_IE_EXCEPTION << InferenceEngine::details::as_status << status;
+            }
+            if (_needPerfCounters)
+                _perfMap = _workerInferRequest->_inferRequest.GetPerformanceCounts();
         }}
     };
 }
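Conceptually, the new first pipeline stage above scans the request inputs for a remote blob and, if it finds one, records that blob's device as the only allowed scheduling target. A stripped-down, hypothetical version of that lookup (simplified types and names, not the patch code):

    #include <map>
    #include <string>
    #include <ie_blob.h>
    #include <ie_remote_context.hpp>

    using BlobMap = std::map<std::string, InferenceEngine::Blob::Ptr>;

    // returns an empty string when no input carries device affinity
    std::string preferredDeviceFor(const BlobMap& inputs) {
        for (const auto& kv : inputs) {
            if (auto remote = kv.second->as<InferenceEngine::RemoteBlob>())
                return remote->getDeviceName();  // e.g. "GPU"
        }
        return "";
    }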
diff --git a/inference-engine/src/multi_device/multi_device_exec_network.cpp b/inference-engine/src/multi_device/multi_device_exec_network.cpp
index d9c1bf0a9b3..b8795376b51 100644
--- a/inference-engine/src/multi_device/multi_device_exec_network.cpp
+++ b/inference-engine/src/multi_device/multi_device_exec_network.cpp
@@ -3,9 +3,7 @@
 //
 ///////////////////////////////////////////////////////////////////////////////////////////////////
-#include
 #include
-#include
 #include
 #include
 #include
@@ -27,6 +25,8 @@ namespace MultiDevicePlugin {
 using namespace InferenceEngine;

 thread_local MultiDeviceExecutableNetwork::WorkerInferRequest* MultiDeviceExecutableNetwork::_thisWorkerInferRequest = nullptr;
+// TODO: revert to the plain variable (see the header file) once we move to the next CentOS 8.x in our support matrix
+thread_local const char* MultiDeviceExecutableNetwork::_thisPreferredDeviceName = "";

 struct IdleGuard {
     explicit IdleGuard(MultiDeviceExecutableNetwork::WorkerInferRequest* workerInferRequestPtr,
@@ -68,7 +68,7 @@ MultiDeviceExecutableNetwork::MultiDeviceExecutableNetwork(const DeviceMap();
-    } catch (const details::InferenceEngineException &iie) {
+    } catch (const InferenceEngine::details::InferenceEngineException &iie) {
         THROW_IE_EXCEPTION
             << "Every device used with the Multi-Device should "
             << "support OPTIMAL_NUMBER_OF_INFER_REQUESTS ExecutableNetwork metric. "
@@ -79,6 +79,7 @@ MultiDeviceExecutableNetwork::MultiDeviceExecutableNetwork(const DeviceMap>(new ThreadSafeQueue);
         auto* idleWorkerRequestsPtr = &(idleWorkerRequests);
         idleWorkerRequests.set_capacity(numRequests);
         for (auto&& workerRequest : workerRequests) {
@@ -95,24 +96,27 @@ MultiDeviceExecutableNetwork::MultiDeviceExecutableNetwork(const DeviceMaptry_push(workerRequestPtr)) {
+                    // let's try to pop a task, as we know there is at least one idle request; schedule it if that succeeds
+                    // if there are no device-agnostic tasks, let's try to pop a device-specific task; schedule it if that succeeds
                     Task t;
-                    // try pop the task, as we know there is at least one idle request
-                    if (_inferPipelineTasks.try_pop(t)) {
-                        // if succeeded, let's schedule that
+                    if (_inferPipelineTasks.try_pop(t))
                         ScheduleToWorkerInferRequest(std::move(t));
-                    }
+                    else if (_inferPipelineTasksDeviceSpecific[device]->try_pop(t))
+                        ScheduleToWorkerInferRequest(std::move(t), device);
                 }
             });
         }
     }
 }

-void MultiDeviceExecutableNetwork::ScheduleToWorkerInferRequest(Task inferPipelineTask) {
+void MultiDeviceExecutableNetwork::ScheduleToWorkerInferRequest(Task inferPipelineTask, DeviceName preferred_device) {
     auto devices = [&] {
         std::lock_guard lock(_mutex);
         return _devicePriorities;
     }();
     for (auto&& device : devices) {
+        if (!preferred_device.empty() && (device.deviceName != preferred_device))
+            continue;
         WorkerInferRequest* workerRequestPtr = nullptr;
         NotBusyWorkerRequests& idleWorkerRequests = _idleWorkerRequests[device.deviceName];
         if (idleWorkerRequests.try_pop(workerRequestPtr)) {
@@ -126,12 +130,15 @@ void MultiDeviceExecutableNetwork::ScheduleToWorkerInferRequest(Task inferPipeli
             return;
         }
     }
-    // no vacant requests this time, storing the task to the queue
-    _inferPipelineTasks.push(std::move(inferPipelineTask));
+    // no vacant requests this time, storing the task to the respective queue
+    if (!preferred_device.empty())
+        _inferPipelineTasksDeviceSpecific[preferred_device]->push(std::move(inferPipelineTask));
+    else
+        _inferPipelineTasks.push(std::move(inferPipelineTask));
 }

 void MultiDeviceExecutableNetwork::run(Task inferPipelineTask) {
-    ScheduleToWorkerInferRequest(std::move(inferPipelineTask));
+    ScheduleToWorkerInferRequest(std::move(inferPipelineTask), _thisPreferredDeviceName);
 }

 MultiDeviceExecutableNetwork::~MultiDeviceExecutableNetwork() {
@@ -149,6 +156,26 @@ MultiDeviceExecutableNetwork::~MultiDeviceExecutableNetwork() {
     _workerRequests.clear();
 }

+RemoteContext::Ptr MultiDeviceExecutableNetwork::GetContext() const {
+    auto devices = [&] {
+        std::lock_guard lock(_mutex);
+        return _devicePriorities;
+    }();
+
+    std::string devices_names;
+    for (auto&& device : devices) {
+        devices_names += device.deviceName + " ";
+        const auto& n = _networksPerDevice.at(device.deviceName);
+        try {
+            return n.GetContext();
+        } catch (const NotImplemented& ex) {
+        }
+    }
+    THROW_IE_EXCEPTION << InferenceEngine::details::as_status << StatusCode::NOT_IMPLEMENTED
+                       << NOT_IMPLEMENTED_str << "None of the devices in the MULTI configuration has an associated remote context. "
+                       << "Current list of devices allowed via the DEVICE_PRIORITIES config: " << devices_names;
+}
+
 InferenceEngine::InferRequestInternal::Ptr MultiDeviceExecutableNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
                                                                                                 InferenceEngine::OutputsDataMap networkOutputs) {
     auto num = _numRequestsCreated++;
@@ -230,7 +257,7 @@ InferenceEngine::Parameter MultiDeviceExecutableNetwork::GetMetric(const std::st
         for (auto n : _networksPerDevice) {
             try {
                 res += n.second.GetMetric(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)).as();
-            } catch (const details::InferenceEngineException &iie) {
+            } catch (const InferenceEngine::details::InferenceEngineException &iie) {
                 THROW_IE_EXCEPTION
                     << "Every device used with the Multi-Device should "
                     << "support OPTIMAL_NUMBER_OF_INFER_REQUESTS ExecutableNetwork metric. "
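In short, the policy implemented above is: a task pinned to a device (because its request carries that device's remote blobs) may only be taken by that device's workers, and an idle worker first drains the device-agnostic queue, then its own device-specific one. A simplified, hypothetical sketch of this two-queue policy using only standard types (not the patch's lock-free structures):

    #include <functional>
    #include <map>
    #include <mutex>
    #include <queue>
    #include <string>

    using Task = std::function<void()>;

    struct TwoLevelScheduler {
        std::mutex m;
        std::queue<Task> genericTasks;                        // any device may take these
        std::map<std::string, std::queue<Task>> deviceTasks;  // pinned by remote-blob affinity

        void enqueue(Task t, const std::string& preferredDevice) {
            std::lock_guard<std::mutex> lock(m);
            if (preferredDevice.empty())
                genericTasks.push(std::move(t));
            else
                deviceTasks[preferredDevice].push(std::move(t));
        }

        // called by a worker of `device` when it becomes idle; returns false if nothing to do
        bool popFor(const std::string& device, Task& t) {
            std::lock_guard<std::mutex> lock(m);
            if (!genericTasks.empty()) {
                t = std::move(genericTasks.front());
                genericTasks.pop();
                return true;
            }
            auto& q = deviceTasks[device];
            if (q.empty())
                return false;
            t = std::move(q.front());
            q.pop();
            return true;
        }
    };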
diff --git a/inference-engine/src/multi_device/multi_device_exec_network.hpp b/inference-engine/src/multi_device/multi_device_exec_network.hpp
index bdea1e449e4..9251f892d1c 100644
--- a/inference-engine/src/multi_device/multi_device_exec_network.hpp
+++ b/inference-engine/src/multi_device/multi_device_exec_network.hpp
@@ -117,17 +117,22 @@ public:
     InferenceEngine::IInferRequest::Ptr CreateInferRequest() override;
     InferenceEngine::InferRequestInternal::Ptr CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
                                                                       InferenceEngine::OutputsDataMap networkOutputs) override;
+    InferenceEngine::RemoteContext::Ptr GetContext() const override;
     ~MultiDeviceExecutableNetwork() override;

-    void ScheduleToWorkerInferRequest(InferenceEngine::Task);
+    void ScheduleToWorkerInferRequest(InferenceEngine::Task, DeviceName preferred_device = "");

     static thread_local WorkerInferRequest* _thisWorkerInferRequest;
-    std::atomic_bool _terminate = {false};
-    std::mutex _mutex;
+    // have to use the const char* ptr rather than std::string due to a bug in old gcc versions;
+    // the bug manifests itself e.g. on the old CentOS (and its 4.8.x gcc) used in our testing:
+    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81880
+    static thread_local const char* _thisPreferredDeviceName;
+    mutable std::mutex _mutex;
     std::vector _devicePriorities;
     const std::vector _devicePrioritiesInitial;
     DeviceMap _networksPerDevice;
     ThreadSafeQueue _inferPipelineTasks;
+    DeviceMap>> _inferPipelineTasksDeviceSpecific;
     DeviceMap _idleWorkerRequests;
     DeviceMap> _workerRequests;
     std::unordered_map _config;
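The comment above refers to gcc bug 81880, which, per the commit message, skips dynamic initialization of certain thread_local variables (the PR first hit it with a templated thread_local std::string). The two workarounds the PR went through, shown side by side for illustration (hypothetical names, not patch code):

    #include <string>

    // (a) what the patch settled on: a trivially-initialized, trivially-destructible
    //     thread_local, which old gcc handles correctly
    static thread_local const char* preferredDeviceName = "";

    // (b) the earlier attempt from the commit message: hide the std::string behind a
    //     function so each thread constructs it on first use
    static std::string& preferredDeviceNameString() {
        static thread_local std::string name;
        return name;
    }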
diff --git a/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_executable_network_internal.hpp b/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_executable_network_internal.hpp
index c2e70b5bf73..765fe365d88 100644
--- a/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_executable_network_internal.hpp
+++ b/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_executable_network_internal.hpp
@@ -64,7 +64,7 @@ public:
     void Export(const std::string& modelFileName) override {
         (void)modelFileName;
-        THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
+        THROW_IE_EXCEPTION << InferenceEngine::details::as_status << StatusCode::NOT_IMPLEMENTED << NOT_IMPLEMENTED_str;
     }

     void Export(std::ostream& networkModel) override {
@@ -76,7 +76,7 @@ public:
     }

     CNNNetwork GetExecGraphInfo() override {
-        THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
+        THROW_IE_EXCEPTION << InferenceEngine::details::as_status << StatusCode::NOT_IMPLEMENTED << NOT_IMPLEMENTED_str;
     }

     /**
@@ -89,7 +89,7 @@ public:
     }

     std::vector QueryState() override {
-        THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
+        THROW_IE_EXCEPTION << InferenceEngine::details::as_status << StatusCode::NOT_IMPLEMENTED << NOT_IMPLEMENTED_str;
     }

     void SetConfig(const std::map& config) override {
@@ -107,11 +107,11 @@ public:
     Parameter GetMetric(const std::string& name) const override {
         (void)name;
-        THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
+        THROW_IE_EXCEPTION << InferenceEngine::details::as_status << StatusCode::NOT_IMPLEMENTED << NOT_IMPLEMENTED_str;
     }

     RemoteContext::Ptr GetContext() const override {
-        THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
+        THROW_IE_EXCEPTION << InferenceEngine::details::as_status << StatusCode::NOT_IMPLEMENTED << NOT_IMPLEMENTED_str;
     }

 protected:
@@ -123,7 +123,7 @@ protected:
      */
     virtual void ExportImpl(std::ostream& networkModel) {
         (void)networkModel;
-        THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
+        THROW_IE_EXCEPTION << InferenceEngine::details::as_status << StatusCode::NOT_IMPLEMENTED << NOT_IMPLEMENTED_str;
     }

     InferenceEngine::InputsDataMap _networkInputs;  //!< Holds information about network inputs info
diff --git a/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_infer_request_internal.hpp b/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_infer_request_internal.hpp
index f0d5316686b..7add8e862a7 100644
--- a/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_infer_request_internal.hpp
+++ b/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_infer_request_internal.hpp
@@ -76,7 +76,8 @@ public:
         }
         if (!data) THROW_IE_EXCEPTION << NOT_ALLOCATED_str << "Failed to set empty blob with name: \'" << name << "\'";
         const bool compoundBlobPassed = data->is<CompoundBlob>();
-        if (!compoundBlobPassed && data->buffer() == nullptr)
+        const bool remoteBlobPassed = data->is<RemoteBlob>();
+        if (!compoundBlobPassed && !remoteBlobPassed && data->buffer() == nullptr)
             THROW_IE_EXCEPTION << "Input data was not allocated. Input name: \'" << name << "\'";
         if (data->size() == 0) {
             THROW_IE_EXCEPTION << "Input data is empty. 
Input name: \'" << name << "\'"; @@ -348,7 +349,8 @@ protected: if (refSize != blob->size()) { THROW_IE_EXCEPTION << strNotMatched + ": got " << blob->size() << " expecting " << refSize; } - if (blob->buffer() == nullptr) THROW_IE_EXCEPTION << strNotAllocated; + const bool remoteBlobPassed = blob->is(); + if (!remoteBlobPassed && blob->buffer() == nullptr) THROW_IE_EXCEPTION << strNotAllocated; } /** diff --git a/inference-engine/tests/functional/plugin/CMakeLists.txt b/inference-engine/tests/functional/plugin/CMakeLists.txt index 339d97fa430..9b1aae9cff0 100644 --- a/inference-engine/tests/functional/plugin/CMakeLists.txt +++ b/inference-engine/tests/functional/plugin/CMakeLists.txt @@ -18,4 +18,5 @@ endif() if (ENABLE_MYRIAD) add_subdirectory(myriad) -endif() \ No newline at end of file +endif() + diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/multi/cpu_remote_blob_tests.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/multi/cpu_remote_blob_tests.cpp new file mode 100644 index 00000000000..5f52d3f4afa --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/multi/cpu_remote_blob_tests.cpp @@ -0,0 +1,15 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "multi/multi_remote_blob_tests.hpp" +#include "common_test_utils/test_constants.hpp" + +const std::vector device_names_and_support_for_remote_blobs { + {{CPU}, false}, // CPU via MULTI +}; + +INSTANTIATE_TEST_CASE_P(smoke_RemoteBlobMultiCPU, MultiDevice_SupportTest, + ::testing::ValuesIn(device_names_and_support_for_remote_blobs), MultiDevice_SupportTest::getTestCaseName); diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/multi/gpu_remote_blob_tests.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/multi/gpu_remote_blob_tests.cpp new file mode 100644 index 00000000000..7a576dad03d --- /dev/null +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/multi/gpu_remote_blob_tests.cpp @@ -0,0 +1,57 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "multi/multi_remote_blob_tests.hpp" +#include "common_test_utils/test_constants.hpp" + +const std::vector device_names_and_support_for_remote_blobs { + {{GPU}, true}, // GPU via MULTI, +#if ENABLE_MKL_DNN + {{GPU, CPU}, true}, // GPU+CPU + {{CPU, GPU}, true}, // CPU+GPU +#endif +}; + +INSTANTIATE_TEST_CASE_P(smoke_RemoteBlobMultiGPU, MultiDevice_SupportTest, + ::testing::ValuesIn(device_names_and_support_for_remote_blobs), MultiDevice_SupportTest::getTestCaseName); + +TEST_P(MultiDevice_Test, cannotInferRemoteBlobIfNotInitializedForDevice) { + InferenceEngine::CNNNetwork net; + net = CNNNetwork(fn_ptr); + auto ie = PluginCache::get().ie(); + // load a network to the GPU to make sure we have a remote context + auto exec_net = ie->LoadNetwork(net, GPU); + auto ctx = exec_net.GetContext(); + + const InferenceEngine::ConstInputsDataMap inputInfo = exec_net.GetInputsInfo(); + auto& first_input_name = inputInfo.begin()->first; + auto& first_input = inputInfo.begin()->second; + auto rblob = InferenceEngine::make_shared_blob(first_input->getTensorDesc(), ctx); + rblob->allocate(); + + ExecutableNetwork exec_net_multi; + try { + exec_net_multi = ie->LoadNetwork(net, device_names); + } catch(...) { + // device is unavailable (e.g. for the "second GPU" test) or other (e.g. 
env) issues not related to the test + return; + } + InferRequest req = exec_net_multi.CreateInferRequest(); + ASSERT_NE((std::shared_ptr)req, nullptr); + ASSERT_NO_THROW(req.SetBlob(first_input_name, rblob)); + ASSERT_NO_THROW(req.StartAsync()); + ASSERT_THROW(req.Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY), InferenceEngine::details::InferenceEngineException); +} + +const std::vector device_names_and_support_for_remote_blobs2 { +#if ENABLE_MKL_DNN + {CPU}, // stand-alone CPU via MULTI (no GPU), no OCL context +#endif + {"GPU.1"}, // another GPU (the test will test its presence), different OCL contexts +}; + +INSTANTIATE_TEST_CASE_P(smoke_RemoteBlobMultiInitializedWithoutGPU, MultiDevice_Test, + ::testing::ValuesIn(device_names_and_support_for_remote_blobs2), MultiDevice_Test::getTestCaseName); diff --git a/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/myriad_remote_blobs_tests.cpp b/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/myriad_remote_blobs_tests.cpp new file mode 100644 index 00000000000..49e442cf117 --- /dev/null +++ b/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/myriad_remote_blobs_tests.cpp @@ -0,0 +1,18 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "multi/multi_remote_blob_tests.hpp" +#include "common_test_utils/test_constants.hpp" + +const std::vector device_names_and_support_for_remote_blobs { + {{MYRIAD}, false}, // MYX via MULTI +#if ENABLE_MKL_DNN + {{CPU, MYRIAD}, false}, // CPU+MYX +#endif +}; + +INSTANTIATE_TEST_CASE_P(smoke_RemoteBlobMultiMyriad, MultiDevice_SupportTest, + ::testing::ValuesIn(device_names_and_support_for_remote_blobs), MultiDevice_SupportTest::getTestCaseName); \ No newline at end of file diff --git a/inference-engine/tests/functional/plugin/shared/include/behavior/core_integration.hpp b/inference-engine/tests/functional/plugin/shared/include/behavior/core_integration.hpp index 4aff2bb283e..769a6843590 100644 --- a/inference-engine/tests/functional/plugin/shared/include/behavior/core_integration.hpp +++ b/inference-engine/tests/functional/plugin/shared/include/behavior/core_integration.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef ENABLE_UNICODE_PATH_SUPPORT #include @@ -60,16 +61,18 @@ namespace BehaviorTestsDefinitions { { \ try { \ __VA_ARGS__; \ - } catch(InferenceEngine::details::InferenceEngineException ieException) { \ + } catch(InferenceEngine::details::InferenceEngineException& ieException) { \ auto notImplementedExceptionIsThrown = \ std::string::npos != std::string {ieException.what()} \ - .find(std::string{"[NOT_IMPLEMENTED] "}); \ + .find(NOT_IMPLEMENTED_str); \ if (notImplementedExceptionIsThrown) { \ GTEST_SKIP(); \ } else { \ FAIL() << "thrown from expression: " # __VA_ARGS__ << std::endl \ << "what: " << ieException.what(); \ } \ + } catch (const InferenceEngine::NotImplemented& ex) { \ + GTEST_SKIP(); \ } \ } diff --git a/inference-engine/tests/functional/plugin/shared/include/behavior/exec_graph_info.hpp b/inference-engine/tests/functional/plugin/shared/include/behavior/exec_graph_info.hpp index e010f3a4e9d..bc060854e77 100644 --- a/inference-engine/tests/functional/plugin/shared/include/behavior/exec_graph_info.hpp +++ b/inference-engine/tests/functional/plugin/shared/include/behavior/exec_graph_info.hpp @@ -61,7 +61,7 @@ TEST_P(ExecGraphTests, CheckExecGraphInfoBeforeExecution) { InferenceEngine::CNNNetwork execGraph; if (targetDevice != 
CommonTestUtils::DEVICE_MULTI && targetDevice != CommonTestUtils::DEVICE_GNA) { // Load CNNNetwork to target plugins - auto execNet = ie->LoadNetwork(cnnNet, targetDevice); + auto execNet = ie->LoadNetwork(cnnNet, targetDevice, configuration); ASSERT_NO_THROW(execGraph = execNet.GetExecGraphInfo()); // Create InferRequest InferenceEngine::InferRequest req; @@ -135,8 +135,8 @@ TEST_P(ExecGraphTests, CheckExecGraphInfoBeforeExecution) { ASSERT_GE(layer.second, 0); } } else { - ASSERT_THROW(ie->LoadNetwork(cnnNet, targetDevice).GetExecGraphInfo(), - InferenceEngine::details::InferenceEngineException); + ASSERT_THROW(ie->LoadNetwork(cnnNet, targetDevice, configuration).GetExecGraphInfo(), + InferenceEngine::NotImplemented); } } @@ -148,7 +148,7 @@ TEST_P(ExecGraphTests, CheckExecGraphInfoAfterExecution) { InferenceEngine::CNNNetwork execGraph; if (targetDevice != CommonTestUtils::DEVICE_MULTI && targetDevice != CommonTestUtils::DEVICE_GNA) { // Load CNNNetwork to target plugins - auto execNet = ie->LoadNetwork(cnnNet, targetDevice); + auto execNet = ie->LoadNetwork(cnnNet, targetDevice, configuration); ASSERT_NO_THROW(execGraph = execNet.GetExecGraphInfo()); // Create InferRequest InferenceEngine::InferRequest req; @@ -235,8 +235,8 @@ TEST_P(ExecGraphTests, CheckExecGraphInfoAfterExecution) { ASSERT_GE(layer.second, 0); } } else { - ASSERT_THROW(ie->LoadNetwork(cnnNet, targetDevice).GetExecGraphInfo(), - InferenceEngine::details::InferenceEngineException); + ASSERT_THROW(ie->LoadNetwork(cnnNet, targetDevice, configuration).GetExecGraphInfo(), + InferenceEngine::NotImplemented); } } @@ -252,7 +252,7 @@ TEST_P(ExecGraphTests, CheckExecGraphInfoSerialization) { InferenceEngine::CNNNetwork execGraph; if (targetDevice != CommonTestUtils::DEVICE_MULTI && targetDevice != CommonTestUtils::DEVICE_GNA) { // Load CNNNetwork to target plugins - auto execNet = ie->LoadNetwork(cnnNet, targetDevice); + auto execNet = ie->LoadNetwork(cnnNet, targetDevice, configuration); ASSERT_NO_THROW(execGraph = execNet.GetExecGraphInfo()); // Create InferRequest InferenceEngine::InferRequest req; @@ -261,8 +261,8 @@ TEST_P(ExecGraphTests, CheckExecGraphInfoSerialization) { ASSERT_EQ(0, std::remove(out_xml_path.c_str())); ASSERT_EQ(0, std::remove(out_bin_path.c_str())); } else { - ASSERT_THROW(ie->LoadNetwork(cnnNet, targetDevice).GetExecGraphInfo(), - InferenceEngine::details::InferenceEngineException); + ASSERT_THROW(ie->LoadNetwork(cnnNet, targetDevice, configuration).GetExecGraphInfo(), + InferenceEngine::NotImplemented); } } } // namespace BehaviorTestsDefinitions \ No newline at end of file diff --git a/inference-engine/tests/functional/plugin/shared/include/multi/multi_remote_blob_tests.hpp b/inference-engine/tests/functional/plugin/shared/include/multi/multi_remote_blob_tests.hpp new file mode 100644 index 00000000000..89d38804451 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/multi/multi_remote_blob_tests.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "multi/multi_helpers.hpp" +#include "functional_test_utils/plugin_cache.hpp" + +TEST_P(MultiDevice_SupportTest, canCreateContextThenRequestThenBlobsAndInfer) { + InferenceEngine::CNNNetwork net; + net = CNNNetwork(fn_ptr); + net.getInputsInfo().begin()->second->setLayout(Layout::NCHW); + net.getInputsInfo().begin()->second->setPrecision(Precision::U8); + + auto ie = PluginCache::get().ie(); + + auto exec_net = ie->LoadNetwork(net, 
device_names); + if (expected_status) { + InferenceEngine::RemoteContext::Ptr ctx; + ASSERT_NE(ctx = exec_net.GetContext(), nullptr); + InferRequest req = exec_net.CreateInferRequest(); + ASSERT_NE((std::shared_ptr)req, nullptr); + const InferenceEngine::ConstInputsDataMap inputInfo = exec_net.GetInputsInfo(); + for (auto i : inputInfo) { + auto rblob = InferenceEngine::make_shared_blob(i.second->getTensorDesc(), ctx); + rblob->allocate(); + req.SetBlob(i.first, rblob); + } + ASSERT_NO_THROW(req.StartAsync()); + ASSERT_EQ(req.Wait(IInferRequest::RESULT_READY), StatusCode::OK); + + } else { + ASSERT_THROW(exec_net.GetContext(), InferenceEngine::NotImplemented); + } +} diff --git a/inference-engine/tests/ie_test_utils/multi/multi_helpers.hpp b/inference-engine/tests/ie_test_utils/multi/multi_helpers.hpp new file mode 100644 index 00000000000..064c75006db --- /dev/null +++ b/inference-engine/tests/ie_test_utils/multi/multi_helpers.hpp @@ -0,0 +1,64 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include "multi-device/multi_device_config.hpp" +#include "common_test_utils/test_common.hpp" +#include "common_test_utils/test_constants.hpp" +#include "ngraph_functions/subgraph_builders.hpp" + +using namespace ::testing; +using namespace InferenceEngine; + +static std::string getDeviceStringWithMulti(std::vector names) { + std::string allDevices = "MULTI:"; + for (auto && device : names) { + allDevices += device; + allDevices += ((device == names[names.size()-1]) ? "" : ","); + } + return allDevices; +} +using DeviceName = std::string; +using DevicesNames = std::vector; +using DevicesNamesAndSupportPair = std::pair; + +class MultiDevice_Test : public CommonTestUtils::TestsCommon, public testing::WithParamInterface { + void SetUp() override { + device_names = getDeviceStringWithMulti(this->GetParam()); + fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat(); + } +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj) { + auto s = getDeviceStringWithMulti(obj.param); + std::replace(s.begin(), s.end(), ',', '_'); + return "device_names_" + s; + } +protected: + std::string device_names; + std::shared_ptr fn_ptr; +}; + +class MultiDevice_SupportTest : public CommonTestUtils::TestsCommon, public testing::WithParamInterface { + void SetUp() override { + device_names = getDeviceStringWithMulti(this->GetParam().first); + expected_status = this->GetParam().second; + fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat(); + } +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj) { + auto s = getDeviceStringWithMulti(obj.param.first); + std::replace(s.begin(), s.end(), ',', '_'); + return "device_names_" + s; + } +protected: + std::string device_names; + bool expected_status; + std::shared_ptr fn_ptr; +}; +#define MULTI CommonTestUtils::DEVICE_MULTI +#define CPU CommonTestUtils::DEVICE_CPU +#define GPU CommonTestUtils::DEVICE_GPU +#define MYRIAD CommonTestUtils::DEVICE_MYRIAD
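To make the helper's output format concrete: getDeviceStringWithMulti joins the device names with commas under a "MULTI:" prefix. An illustrative, self-contained check (not part of the test suite, with an index-based loop so duplicate device names cannot confuse the separator logic):

    #include <cassert>
    #include <string>
    #include <vector>

    // condensed re-statement of the helper defined above
    static std::string getDeviceStringWithMulti(const std::vector<std::string>& names) {
        std::string allDevices = "MULTI:";
        for (size_t i = 0; i < names.size(); ++i) {
            allDevices += names[i];
            if (i + 1 < names.size())
                allDevices += ",";
        }
        return allDevices;
    }

    int main() {
        assert(getDeviceStringWithMulti({"GPU"}) == "MULTI:GPU");
        assert(getDeviceStringWithMulti({"GPU", "CPU"}) == "MULTI:GPU,CPU");
        return 0;
    }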