Add new compile_model API to support hashing in-memory models (#14543)

* Add new compile_model API for the ONNX Runtime OV EP

Allow compile_model() to accept model/weights data.

* Minor cleanup

* Cache model if possible

* Compute hash based on model_xml and model_weight

* Fix typo

* Change hash key computation for model's weights

* Resolve test case issue

* Use tensor instead of blob for hash computation

* Fix hash computation issue and add more test cases

* Fix a build issue caused by data format
Author: River Li, 2023-01-10 16:32:34 +08:00 (committed by GitHub)
Parent: b64c1ff20a
Commit: 246a287c34
11 changed files with 352 additions and 0 deletions

View File

@@ -99,6 +99,27 @@ public:
const std::map<std::string, std::string>& config,
const std::function<void(const ie::CNNNetwork&)>& val = nullptr) = 0;
/**
 * @brief Creates an executable network from in-memory model data.
*
* Users can create as many networks as they need and use
* them simultaneously (up to the limitation of the hardware resources)
*
* @param modelStr String data of model
* @param weights Model's weights
* @param deviceName Name of device to load network to
* @param config Optional map of pairs: (config parameter name, config parameter value) relevant only for this load
* operation
* @param val Optional callback to perform validation of loaded CNNNetwork, if ReadNetwork is triggered
* @return An executable network reference
*/
virtual ie::SoExecutableNetworkInternal LoadNetwork(
const std::string& modelStr,
const ie::Blob::CPtr& weights,
const std::string& deviceName,
const std::map<std::string, std::string>& config,
const std::function<void(const ie::CNNNetwork&)>& val = nullptr) = 0;
/**
 * @brief Creates an executable network from a previously exported network
 * @param networkModel network model stream
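A hedged sketch of how a component that already holds the dev-API ie::ICore pointer (such as the ONNX Runtime OV EP) could call the new overload; the blob construction (U8 precision, C layout), the helper name, and the "CPU" device are illustrative assumptions, not taken from this diff:

#include <ie_blob.h>  // InferenceEngine::Blob, make_shared_blob
// ie::ICore itself comes from the OpenVINO dev-API headers, assumed available in plugin/EP builds.

InferenceEngine::SoExecutableNetworkInternal compile_from_memory(InferenceEngine::ICore* core,
                                                                 const std::string& model_xml,
                                                                 std::vector<uint8_t>& weight_bytes) {
    // Wrap the raw weight bytes in a U8 blob; the blob only references the buffer,
    // so weight_bytes must stay alive until LoadNetwork returns.
    InferenceEngine::TensorDesc desc(InferenceEngine::Precision::U8,
                                     {weight_bytes.size()},
                                     InferenceEngine::Layout::C);
    InferenceEngine::Blob::CPtr weights =
        InferenceEngine::make_shared_blob<uint8_t>(desc, weight_bytes.data(), weight_bytes.size());
    // Config map and validation callback stay optional, as in the declaration above.
    return core->LoadNetwork(model_xml, weights, "CPU", {});
}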

View File

@@ -255,6 +255,44 @@ public:
return compile_model(model_path, device_name, AnyMap{std::forward<Properties>(properties)...});
}
/**
* @brief Reads a model and creates a compiled model from the IR/ONNX/PDPD memory.
* @param model String with a model in IR/ONNX/PDPD format.
 * @param weights Constant tensor with weights.
 * Reading ONNX/PDPD models does not support loading weights from the @p weights tensor.
* @param device_name Name of a device to load a model to.
* @param properties Optional map of pairs: (property name, property value) relevant only for this load
* operation.
* @note Created model object shares the weights with the @p weights object.
* Thus, do not create @p weights on temporary data that can be freed later, since the model
* constant data will point to an invalid memory.
* @return A compiled model.
*/
CompiledModel compile_model(const std::string& model,
const ov::Tensor& weights,
const std::string& device_name,
const AnyMap& properties = {});
/**
* @brief Reads a model and creates a compiled model from the IR/ONNX/PDPD memory.
* @param model String with a model in IR/ONNX/PDPD format.
 * @param weights Constant tensor with weights.
 * Reading ONNX/PDPD models does not support loading weights from the @p weights tensor.
* @param device_name Name of a device to load a model to.
* @tparam Properties Should be a pack of `std::pair<std::string, ov::Any>` types.
* @note Created model object shares the weights with the @p weights object.
* Thus, do not create @p weights on temporary data that can be freed later, since the model
* constant data will point to an invalid memory.
* @return A compiled model.
*/
template <typename... Properties>
util::EnableIfAllStringAny<CompiledModel, Properties...> compile_model(const std::string& model,
const ov::Tensor& weights,
const std::string& device_name,
Properties&&... properties) {
return compile_model(model, weights, device_name, AnyMap{std::forward<Properties>(properties)...});
}
/**
 * @brief Creates a compiled model from a source model within a specified remote context.
 * @param model Model object acquired from Core::read_model.
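For reference, a minimal end-to-end sketch of the new public overload; the file names and the LATENCY hint below are placeholders, not part of this change:

#include <openvino/openvino.hpp>
#include <fstream>
#include <sstream>
#include <vector>

int main() {
    // Read the serialized model and its weights into memory (placeholder file names).
    std::ifstream xml_file("model.xml", std::ios::binary);
    std::stringstream xml_stream;
    xml_stream << xml_file.rdbuf();
    std::string model_xml = xml_stream.str();

    std::ifstream bin_file("model.bin", std::ios::binary | std::ios::ate);
    std::vector<uint8_t> bin(static_cast<size_t>(bin_file.tellg()));
    bin_file.seekg(0);
    bin_file.read(reinterpret_cast<char*>(bin.data()), bin.size());

    // The tensor only wraps the buffer, so `bin` must outlive the compiled model
    // (same {1, 1, 1, N} U8 layout as used by the tests added in this change).
    ov::Tensor weights(ov::element::u8, {1, 1, 1, bin.size()}, bin.data());

    ov::Core core;
    auto compiled = core.compile_model(model_xml, weights, "CPU",
                                       ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
    auto request = compiled.create_infer_request();
    return 0;
}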

View File

@@ -140,6 +140,34 @@ std::string NetworkCompilationContext::computeHash(const std::string& modelName,
return std::to_string(seed);
}
std::string NetworkCompilationContext::computeHash(const std::string& modelStr,
const ov::Tensor& tensor,
const std::map<std::string, std::string>& compileOptions) {
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::IE_LT, "NetworkCompilationContext::computeHash - Model Memory");
uint64_t seed = 0;
// model string
seed = hash_combine(seed, modelStr);
// tensor data
seed = hash_combine(seed, tensor.get_size());
auto ptr = static_cast<size_t*>(tensor.data());
size_t size = tensor.get_size() / sizeof(size_t);
for (size_t i = 0; i < size; i++)
seed = hash_combine(seed, ptr[i]);
auto size_done = size * sizeof(size_t);
auto ptr_left = static_cast<uint8_t*>(tensor.data()) + size_done;
size_t size_left = tensor.get_size() - size_done;
for (size_t i = 0; i < size_left; i++)
seed = hash_combine(seed, ptr_left[i]);
// compile options
for (const auto& kvp : compileOptions) {
seed = hash_combine(seed, kvp.first + kvp.second);
}
return std::to_string(seed);
}
//////////////////////////////////////////////////
CompiledBlobHeader::CompiledBlobHeader() {}
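The hash_combine helper itself is not shown in this diff; below is a self-contained sketch of the same chunked hashing idea used above (whole size_t words first, then the remaining tail bytes), assuming a boost-style combiner:

#include <cstdint>
#include <functional>
#include <string>
#include <vector>

// Assumed boost-style combiner; the real hash_combine lives elsewhere in the IE sources.
template <typename T>
uint64_t hash_combine(uint64_t seed, const T& value) {
    return seed ^ (std::hash<T>()(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}

uint64_t hash_model_memory(const std::string& model_str, const std::vector<uint8_t>& bytes) {
    uint64_t seed = 0;
    seed = hash_combine(seed, model_str);
    seed = hash_combine(seed, bytes.size());
    // Hash full size_t-sized words first for speed...
    const size_t words = bytes.size() / sizeof(size_t);
    const size_t* word_ptr = reinterpret_cast<const size_t*>(bytes.data());
    for (size_t i = 0; i < words; ++i)
        seed = hash_combine(seed, word_ptr[i]);
    // ...then the remaining tail bytes one by one, mirroring the diff above.
    for (size_t i = words * sizeof(size_t); i < bytes.size(); ++i)
        seed = hash_combine(seed, bytes[i]);
    return seed;
}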

View File

@@ -9,6 +9,10 @@
#include <ostream>
#include <string>
namespace ov {
class Tensor;
} // namespace ov
namespace InferenceEngine {
class CNNNetwork;
@@ -20,6 +24,9 @@ struct NetworkCompilationContext final {
static std::string computeHash(const std::string& modelName,
const std::map<std::string, std::string>& compileOptions);
static std::string computeHash(const std::string& modelStr,
const ov::Tensor& data,
const std::map<std::string, std::string>& compileOptions);
};
class CompiledBlobHeader final {

View File

@@ -546,6 +546,15 @@ class CoreImpl : public ie::ICore, public std::enable_shared_from_this<ie::ICore
return ie::NetworkCompilationContext::computeHash(modelName, compileConfig);
}
std::string CalculateMemoryHash(const std::string& modelStr,
const ov::Tensor& weights,
const std::string& deviceFamily,
const ov::InferencePlugin& plugin,
const std::map<std::string, std::string>& config) const {
auto compileConfig = CreateCompileConfig(plugin, deviceFamily, config);
return ie::NetworkCompilationContext::computeHash(modelStr, weights, compileConfig);
}
public:
CoreImpl(bool _newAPI) : newAPI(_newAPI) {
add_mutex("");  // Register global mutex
@@ -884,6 +893,46 @@ public:
return {res._ptr, res._so};
}
ie::SoExecutableNetworkInternal LoadNetwork(const std::string& modelStr,
const ie::Blob::CPtr& weights,
const std::string& deviceName,
const std::map<std::string, std::string>& config,
const std::function<void(const CNNNetwork&)>& val = nullptr) override {
OV_ITT_SCOPE(FIRST_INFERENCE, ie::itt::domains::IE_LT, "Core::LoadNetwork::Memory");
auto parsed = parseDeviceNameIntoConfig(deviceName, config);
auto plugin = GetCPPPluginByName(parsed._deviceName);
ov::SoPtr<ie::IExecutableNetworkInternal> res;
auto cacheManager =
coreConfig.getCacheConfigForDevice(parsed._deviceName, DeviceSupportsCacheDir(plugin), parsed._config)
._cacheManager;
auto cacheContent = CacheContent{cacheManager};
if (cacheManager && DeviceSupportsImportExport(plugin)) {
bool loadedFromCache = false;
ov::Tensor tensor = ov::Tensor();
if (weights) {
tensor = ov::Tensor(element::u8, {weights->byteSize()}, weights->cbuffer().as<uint8_t*>());
}
cacheContent.blobId = CalculateMemoryHash(modelStr, tensor, parsed._deviceName, plugin, parsed._config);
auto lock = cacheGuard.getHashLock(cacheContent.blobId);
res = LoadNetworkFromCache(cacheContent, plugin, parsed._config, nullptr, loadedFromCache);
if (!loadedFromCache) {
auto cnnNetwork = ReadNetwork(modelStr, weights);
if (val) {
val(cnnNetwork);
}
res = compile_model_impl(cnnNetwork, plugin, parsed._config, nullptr, cacheContent);
}
} else {
auto cnnNetwork = ReadNetwork(modelStr, weights);
if (val) {
val(cnnNetwork);
}
res = compile_model_impl(cnnNetwork, plugin, parsed._config, nullptr, cacheContent);
}
return {res._ptr, res._so};
}
ie::SoExecutableNetworkInternal ImportNetwork(std::istream& networkModel,
const std::string& deviceName,
const std::map<std::string, std::string>& config) override {
@@ -2003,6 +2052,20 @@ CompiledModel Core::compile_model(const std::string& modelPath, const std::strin
});
}
CompiledModel Core::compile_model(const std::string& model,
const ov::Tensor& weights,
const std::string& deviceName,
const AnyMap& config) {
InferenceEngine::Blob::Ptr blob;
if (weights) {
blob = weights._impl;
}
OV_CORE_CALL_STATEMENT({
auto exec = _impl->LoadNetwork(model, blob, deviceName, any_copy(flatten_sub_properties(deviceName, config)));
return {exec._ptr, exec._so};
});
}
CompiledModel Core::compile_model(const std::shared_ptr<const ov::Model>& model,
const RemoteContext& context,
const AnyMap& config) {
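A short usage sketch of the caching path this overload feeds into, continuing from the model_xml/bin buffers prepared in the earlier sketch; the cache directory name is a placeholder and the cache hit assumes a device that supports import/export (e.g. CPU):

ov::Core core;
core.set_property(ov::cache_dir("model_cache"));  // enable the compiled-blob cache (placeholder dir)
ov::Tensor weights(ov::element::u8, {1, 1, 1, bin.size()}, bin.data());
// First call: hash(model string, weights bytes, compile options) misses the cache,
// so the model is read, compiled, and exported under that hash.
auto first = core.compile_model(model_xml, weights, "CPU");
// Second call with identical inputs: the same hash is computed and the cached blob is imported.
auto second = core.compile_model(model_xml, weights, "CPU");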

View File

@@ -173,4 +173,10 @@ namespace {
::testing::ValuesIn(TestTargets),
::testing::ValuesIn(LoadFromFileConfigs)),
CompileModelLoadFromFileTestBase::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Auto_CachingSupportCase_CPU,
CompileModelLoadFromMemoryTestBase,
::testing::Combine(::testing::ValuesIn(TestTargets),
::testing::ValuesIn(LoadFromFileConfigs)),
CompileModelLoadFromMemoryTestBase::getTestCaseName);
}  // namespace

View File

@@ -83,6 +83,12 @@ namespace {
::testing::ValuesIn(LoadFromFileConfigs)),
CompileModelLoadFromFileTestBase::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Auto_CachingSupportCase_GPU,
CompileModelLoadFromMemoryTestBase,
::testing::Combine(::testing::ValuesIn(TestTargets),
::testing::ValuesIn(LoadFromFileConfigs)),
CompileModelLoadFromMemoryTestBase::getTestCaseName);
const std::vector<ov::AnyMap> GPULoadFromFileConfigs = {
{ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)},
{ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY)},
@@ -94,4 +100,9 @@ namespace {
::testing::ValuesIn(GPULoadFromFileConfigs)),
CompileModelLoadFromFileTestBase::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_CachingSupportCase_GPU,
CompileModelLoadFromMemoryTestBase,
::testing::Combine(::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::ValuesIn(GPULoadFromFileConfigs)),
CompileModelLoadFromMemoryTestBase::getTestCaseName);
}  // namespace

View File

@@ -84,6 +84,64 @@ TEST_P(OVExecutableNetworkBaseTest, canLoadCorrectNetworkToGetExecutable) {
EXPECT_NO_THROW(auto execNet = core->compile_model(function, target_device, configuration));
}
TEST_P(OVExecutableNetworkBaseTest, canLoadNetworkFromMemory) {
std::string model = R"V0G0N(
<net name="Network" version="10">
<layers>
<layer name="in1" type="Parameter" id="0" version="opset8">
<data element_type="f16" shape="1,3,22,22"/>
<output>
<port id="0" precision="FP16" names="data">
<dim>1</dim>
<dim>3</dim>
<dim>22</dim>
<dim>22</dim>
</port>
</output>
</layer>
<layer name="round" id="1" type="Round" version="opset8">
<data mode="half_to_even"/>
<input>
<port id="1" precision="FP16">
<dim>1</dim>
<dim>3</dim>
<dim>22</dim>
<dim>22</dim>
</port>
</input>
<output>
<port id="2" precision="FP16" names="r">
<dim>1</dim>
<dim>3</dim>
<dim>22</dim>
<dim>22</dim>
</port>
</output>
</layer>
<layer name="output" type="Result" id="2" version="opset8">
<input>
<port id="0" precision="FP16">
<dim>1</dim>
<dim>3</dim>
<dim>22</dim>
<dim>22</dim>
</port>
</input>
</layer>
</layers>
<edges>
<edge from-layer="0" from-port="0" to-layer="1" to-port="1"/>
<edge from-layer="1" from-port="2" to-layer="2" to-port="0"/>
</edges>
</net>
)V0G0N";
if (target_device.find("GNA") != std::string::npos) {
GTEST_SKIP();
}
EXPECT_NO_THROW(auto execNet = core->compile_model(model, ov::Tensor(), target_device, configuration));
}
TEST(OVExecutableNetworkBaseTest, smoke_LoadNetworkToDefaultDeviceNoThrow) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()
std::shared_ptr<ov::Core> core = utils::PluginCache::get().core();

View File

@@ -82,6 +82,27 @@ public:
void run() override;
};
using compileModelLoadFromMemoryParams = std::tuple<std::string, // device name
ov::AnyMap // device configuration
>;
class CompileModelLoadFromMemoryTestBase : public testing::WithParamInterface<compileModelLoadFromMemoryParams>,
virtual public SubgraphBaseTest,
virtual public OVPluginTestBase {
std::string m_cacheFolderName;
std::string m_modelName;
std::string m_weightsName;
std::string m_model;
ov::Tensor m_weights;
std::vector<std::uint8_t> weights_vector;
public:
static std::string getTestCaseName(testing::TestParamInfo<compileModelLoadFromMemoryParams> obj);
void SetUp() override;
void TearDown() override;
void run() override;
};
using compileKernelsCacheParams = std::tuple<
std::string,  // device name
std::pair<ov::AnyMap, std::string>  // device and cache configuration

View File

@@ -321,6 +321,98 @@ TEST_P(CompileModelLoadFromFileTestBase, CanLoadFromFileWithoutExecption) {
run();
}
std::string CompileModelLoadFromMemoryTestBase::getTestCaseName(
testing::TestParamInfo<compileModelLoadFromMemoryParams> obj) {
auto param = obj.param;
auto deviceName = std::get<0>(param);
auto configuration = std::get<1>(param);
std::ostringstream result;
std::replace(deviceName.begin(), deviceName.end(), ':', '.');
result << "device_name=" << deviceName << "_";
for (auto& iter : configuration) {
result << "_" << iter.first << "_" << iter.second.as<std::string>() << "_";
}
return result.str();
}
void CompileModelLoadFromMemoryTestBase::SetUp() {
ovModelWithName funcPair;
std::tie(targetDevice, configuration) = GetParam();
target_device = targetDevice;
APIBaseTest::SetUp();
std::stringstream ss;
auto hash = std::hash<std::string>()(SubgraphBaseTest::GetTestName());
ss << "testCache_" << std::to_string(hash) << "_" << std::this_thread::get_id() << "_" << GetTimestamp();
m_modelName = ss.str() + ".xml";
m_weightsName = ss.str() + ".bin";
for (auto& iter : configuration) {
ss << "_" << iter.first << "_" << iter.second.as<std::string>() << "_";
}
m_cacheFolderName = ss.str();
core->set_property(ov::cache_dir());
ngraph::pass::Manager manager;
manager.register_pass<ov::pass::Serialize>(m_modelName, m_weightsName);
manager.run_passes(ngraph::builder::subgraph::makeConvPoolRelu(
{1, 3, 227, 227},
InferenceEngine::details::convertPrecision(InferenceEngine::Precision::FP32)));
try {
std::ifstream model_file(m_modelName, std::ios::binary);
std::stringstream ss;
ss << model_file.rdbuf();
m_model = ss.str();
} catch (const Exception& ex) {
GTEST_FAIL() << "Can't read xml file from: " << m_modelName << "\nException [" << ex.what() << "]" << std::endl;
}
try {
std::ifstream weights_file(m_weightsName, std::ios::binary);
weights_file.unsetf(std::ios::skipws);
weights_file.seekg(0, std::ios::end);
const auto weights_size = static_cast<std::size_t>(weights_file.tellg());
weights_file.seekg(0, std::ios::beg);
weights_vector.reserve(weights_size);
weights_vector.insert(weights_vector.begin(),
std::istream_iterator<std::uint8_t>(weights_file),
std::istream_iterator<std::uint8_t>());
m_weights = ov::Tensor(ov::element::u8, {1, 1, 1, weights_size}, weights_vector.data());
} catch (const Exception& ex) {
GTEST_FAIL() << "Can't read weights file from: " << m_weightsName << "\nException [" << ex.what() << "]"
<< std::endl;
}
}
void CompileModelLoadFromMemoryTestBase::TearDown() {
CommonTestUtils::removeFilesWithExt(m_cacheFolderName, "blob");
CommonTestUtils::removeFilesWithExt(m_cacheFolderName, "cl_cache");
CommonTestUtils::removeIRFiles(m_modelName, m_weightsName);
std::remove(m_cacheFolderName.c_str());
core->set_property(ov::cache_dir());
APIBaseTest::TearDown();
weights_vector.clear();
}
void CompileModelLoadFromMemoryTestBase::run() {
SKIP_IF_CURRENT_TEST_IS_DISABLED();
core->set_property(ov::cache_dir(m_cacheFolderName));
try {
compiledModel = core->compile_model(m_model, m_weights, targetDevice, configuration);
inferRequest = compiledModel.create_infer_request();
inferRequest.infer();
} catch (const Exception& ex) {
GTEST_FAIL() << "Can't loadNetwork with model path " << m_modelName << "\nException [" << ex.what() << "]"
<< std::endl;
} catch (...) {
GTEST_FAIL() << "Can't compile network with model path " << m_modelName << std::endl;
}
}
TEST_P(CompileModelLoadFromMemoryTestBase, CanLoadFromMemoryWithoutExecption) {
run();
}
std::string CompiledKernelsCacheTest::getTestCaseName(testing::TestParamInfo<compileKernelsCacheParams> obj) {
auto param = obj.param;
std::string deviceName;

View File

@@ -21,6 +21,13 @@ public:
const std::string &,
const std::map<std::string, std::string> &,
const std::function<void(const InferenceEngine::CNNNetwork&)> &));
MOCK_METHOD5(
LoadNetwork,
InferenceEngine::SoExecutableNetworkInternal(const std::string&,
const InferenceEngine::Blob::CPtr&,
const std::string&,
const std::map<std::string, std::string>&,
const std::function<void(const InferenceEngine::CNNNetwork&)>&));
MOCK_METHOD3(ImportNetwork, InferenceEngine::SoExecutableNetworkInternal(
std::istream&, const std::string&, const std::map<std::string, std::string>&));