[GPU] Fallback to kernel caching in the case of dynamic models (#15842)
* use kernel caching for dynamic models
* replace cl_cache with blob
* serialize dims info of inputs and outputs
* skip unicode tests on Windows
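In practice (a minimal usage sketch, not part of this commit; the model path and input tensor name are placeholders): with ov::cache_dir set, a static-shape model is still exported as a whole blob, while a dynamic model now falls back to per-kernel caching instead of skipping the cache entirely.

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    core.set_property(ov::cache_dir("gpu_cache"));  // enable caching

    auto model = core.read_model("model.xml");      // placeholder path
    // Dynamic spatial dims: with this commit the GPU plugin disables
    // whole-model (blob) caching and falls back to kernel caching.
    model->reshape({{"input", ov::PartialShape{1, 3, -1, -1}}});

    auto compiled = core.compile_model(model, "GPU");
    return 0;
}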
@@ -46,9 +46,6 @@ public:
     ExecutionConfig m_config;
     InferenceEngine::ITaskExecutor::Ptr m_taskExecutor;
     InferenceEngine::ITaskExecutor::Ptr m_waitExecutor;
-
-private:
-    bool is_serializable();
 };

 }  // namespace intel_gpu
@@ -18,7 +18,7 @@ namespace intel_gpu {
 class Plugin : public InferenceEngine::IInferencePlugin {
     struct impl;
     std::shared_ptr<impl> _impl;
-    bool isModelCachingEnabled = false;
+    bool isModelCachingEnabled = true;

     std::string default_device_id = "0";
     // key: device_id, value: cldnn device
@@ -324,10 +324,8 @@ private:
     void build_primitive(const ExecutionConfig& config) {
         auto cache_outpath = get_cache_directory(config);

-        if (const char* env_p = std::getenv("OV_GPU_CACHE_MODEL")) {
-            if (env_p[0] == '1') {
-                cache_outpath = "";
-            }
-        }
+        if (!config.get_property(ov::intel_gpu::allow_new_shape_infer)) {
+            cache_outpath = "";
+        }

         if (cache_outpath.empty()) {
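The same property-based gate replaces the OV_GPU_CACHE_MODEL environment check here as in kernels_cache::is_cache_enabled below: when the model does not need new (dynamic) shape inference, the kernel-cache output path is cleared and whole-model caching applies instead.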
@@ -6,6 +6,7 @@
 #include "intel_gpu/graph/serialization/binary_buffer.hpp"
 #include "intel_gpu/graph/serialization/string_serializer.hpp"
 #include "intel_gpu/graph/serialization/utils.hpp"
+#include "intel_gpu/graph/serialization/vector_serializer.hpp"
 #include "intel_gpu/plugin/graph.hpp"
 #include "intel_gpu/runtime/itt.hpp"
 #include "intel_gpu/plugin/infer_request.hpp"
@@ -96,11 +97,14 @@ CompiledModel::CompiledModel(std::istream& networkModel, InferenceEngine::Remote
         std::string name;
         std::string precision;
         std::string layout;
+        InferenceEngine::SizeVector dims;
         ib >> name;
         ib >> precision;
         ib >> layout;
+        ib >> dims;

         DataPtr input = std::make_shared<Data>(name, Precision::FromStr(precision), cldnn::serial_util::layout_from_string(layout));
+        input->setDims(dims);
         InputInfo::Ptr infoNew = std::make_shared<InputInfo>();
         infoNew->setInputData(input);
         inputs.emplace(std::make_pair(name, infoNew));
@@ -115,11 +119,14 @@ CompiledModel::CompiledModel(std::istream& networkModel, InferenceEngine::Remote
         std::string name;
         std::string precision;
         std::string layout;
+        InferenceEngine::SizeVector dims;
         ib >> name;
         ib >> precision;
         ib >> layout;
+        ib >> dims;

         DataPtr output = std::make_shared<Data>(name, Precision::FromStr(precision), cldnn::serial_util::layout_from_string(layout));
+        output->setDims(dims);
         outputs.emplace(std::make_pair(name, output));
     }
@@ -317,14 +324,6 @@ IInferRequestInternal::Ptr CompiledModel::CreateInferRequest() {
                                          _callbackExecutor);
 }

-bool CompiledModel::is_serializable() {
-    // Dynamic model serialization is not yet supported.
-    if (m_graphs[0]->GetNetwork()->is_dynamic())
-        return false;
-
-    return true;
-}
-
 // Cache blob format:
 //     [ ConstInputsDataMap / ConstOutputsDataMap ]
 //     [ ov::Node::Input/ ov::Node::Output ]
@@ -334,9 +333,6 @@ void CompiledModel::Export(std::ostream& networkModel) {
     if (m_graphs.empty())
         IE_THROW(NetworkNotLoaded);

-    if (!is_serializable())
-        return;
-
     cldnn::BinaryOutputBuffer ob(networkModel);

     // InputsInfo and OutputsInfo for CNNNetwork
@@ -350,6 +346,7 @@ void CompiledModel::Export(std::ostream& networkModel) {
             std::stringstream ss;
             ss << in.second->getInputData()->getLayout();
             ob << ss.str();
+            ob << in.second->getTensorDesc().getDims();
         }

         ob << GetOutputsInfo().size();
@@ -361,6 +358,7 @@ void CompiledModel::Export(std::ostream& networkModel) {
             std::stringstream ss;
             ss << out.second->getLayout();
             ob << ss.str();
+            ob << out.second->getTensorDesc().getDims();
         }
     }
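To make the record layout visible, here is a standalone round-trip of the per-tensor entry the hunks above write and read (illustration only: the plugin streams through cldnn::BinaryOutputBuffer / BinaryInputBuffer, which are binary, whereas this sketch uses text streams):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct TensorRecord {
    std::string name, precision, layout;
    std::vector<size_t> dims;  // new in this commit: dims travel with the record
};

// Export side: mirrors "ob << name; ob << precision; ob << ss.str(); ob << ...getDims();"
std::ostream& operator<<(std::ostream& os, const TensorRecord& r) {
    os << r.name << ' ' << r.precision << ' ' << r.layout << ' ' << r.dims.size();
    for (size_t d : r.dims) os << ' ' << d;
    return os;
}

// Import side: mirrors "ib >> name; ib >> precision; ib >> layout; ib >> dims;"
std::istream& operator>>(std::istream& is, TensorRecord& r) {
    size_t rank = 0;
    is >> r.name >> r.precision >> r.layout >> rank;
    r.dims.resize(rank);
    for (size_t& d : r.dims) is >> d;
    return is;
}

int main() {
    std::stringstream blob;
    blob << TensorRecord{"input0", "FP32", "NCHW", {1, 3, 224, 224}};

    TensorRecord back;
    blob >> back;
    std::cout << back.name << " rank=" << back.dims.size() << '\n';  // prints: input0 rank=4
    return 0;
}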
@@ -142,12 +142,6 @@ Plugin::Plugin() : m_default_contexts({}) {
             m_default_contexts.insert({device.first, ctx});
         }
     }
-
-    if (const char* env_p = std::getenv("OV_GPU_CACHE_MODEL")) {
-        if (env_p[0] == '1') {
-            isModelCachingEnabled = true;
-        }
-    }
 }

 auto check_inputs = [](InferenceEngine::InputsDataMap _networkInputs) {
@@ -204,6 +198,9 @@ IExecutableNetworkInternal::Ptr Plugin::LoadExeNetworkImpl(const InferenceEngine
     {
         OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::LoadExeNetworkImpl::CreateExeNetwork");
         CompiledModel::Ptr exeNetwork = std::make_shared<CompiledModel>(transformedNetwork, context, config);
+        if (exeNetwork->m_graphs[0]->GetNetwork()->is_dynamic()) {
+            isModelCachingEnabled = false;
+        }
         update_memory_statistics(context->get_impl());
         return exeNetwork;
     }
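Note the two-step design: isModelCachingEnabled now defaults to true (plugin.hpp hunk above) and is cleared once a compiled graph turns out to be dynamic. Whether a network is dynamic is only established after the CompiledModel is built, which is presumably why the check sits in LoadExeNetworkImpl rather than in the plugin constructor.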
@@ -70,10 +70,8 @@ std::string kernels_cache::get_cache_path() const {
 }

 bool kernels_cache::is_cache_enabled() const {
-    if (const char* env_p = std::getenv("OV_GPU_CACHE_MODEL")) {
-        if (env_p[0] == '1') {
-            return false;
-        }
-    }
+    if (!_config.get_property(ov::intel_gpu::allow_new_shape_infer)) {
+        return false;
+    }

     return !_config.get_property(ov::cache_dir).empty();
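Putting the pieces together, the gating after this change reduces to the following (a paraphrasing sketch of the predicates in this diff, not a public API):

#include <string>

// Whole-model (blob) caching: the default path, disabled for dynamic graphs
// (see Plugin::LoadExeNetworkImpl above).
bool model_caching_enabled(bool graph_is_dynamic) {
    return !graph_is_dynamic;
}

// Kernel caching: the fallback path, now keyed off the allow_new_shape_infer
// property instead of the OV_GPU_CACHE_MODEL environment variable
// (see kernels_cache::is_cache_enabled above). Both paths require ov::cache_dir.
bool kernel_caching_enabled(bool allow_new_shape_infer, const std::string& cache_dir) {
    if (!allow_new_shape_infer)
        return false;
    return !cache_dir.empty();
}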