[GPU] Fallback to kernel caching in the case of dynamic models (#15842)
* use kernel caching for dynamic models
* replace cl_cache with blob
* serialize dims info of inputs and outputs
* skip unicode tests on Windows
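In practice (a minimal usage sketch, not part of this commit; the model path and input tensor name are placeholders): with ov::cache_dir set, a static-shape model is still exported as a whole blob, while a dynamic model now falls back to per-kernel caching instead of skipping the cache entirely.

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    core.set_property(ov::cache_dir("gpu_cache"));  // enable caching

    auto model = core.read_model("model.xml");      // placeholder path
    // Dynamic spatial dims: with this commit the GPU plugin disables
    // whole-model (blob) caching and falls back to kernel caching.
    model->reshape({{"input", ov::PartialShape{1, 3, -1, -1}}});

    auto compiled = core.compile_model(model, "GPU");
    return 0;
}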
@@ -46,9 +46,6 @@ public:
     ExecutionConfig m_config;
     InferenceEngine::ITaskExecutor::Ptr m_taskExecutor;
     InferenceEngine::ITaskExecutor::Ptr m_waitExecutor;
-
-private:
-    bool is_serializable();
 };

 }  // namespace intel_gpu
@@ -18,7 +18,7 @@ namespace intel_gpu {
 class Plugin : public InferenceEngine::IInferencePlugin {
     struct impl;
     std::shared_ptr<impl> _impl;
-    bool isModelCachingEnabled = false;
+    bool isModelCachingEnabled = true;

     std::string default_device_id = "0";
     // key: device_id, value: cldnn device
@@ -324,10 +324,8 @@ private:
     void build_primitive(const ExecutionConfig& config) {
         auto cache_outpath = get_cache_directory(config);

-        if (const char* env_p = std::getenv("OV_GPU_CACHE_MODEL")) {
-            if (env_p[0] == '1') {
-                cache_outpath = "";
-            }
-        }
+        if (!config.get_property(ov::intel_gpu::allow_new_shape_infer)) {
+            cache_outpath = "";
+        }

         if (cache_outpath.empty()) {
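The same property-based gate replaces the OV_GPU_CACHE_MODEL environment check here as in kernels_cache::is_cache_enabled below: when the model does not need new (dynamic) shape inference, the kernel-cache output path is cleared and whole-model caching applies instead.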
@@ -6,6 +6,7 @@
 #include "intel_gpu/graph/serialization/binary_buffer.hpp"
 #include "intel_gpu/graph/serialization/string_serializer.hpp"
 #include "intel_gpu/graph/serialization/utils.hpp"
+#include "intel_gpu/graph/serialization/vector_serializer.hpp"
 #include "intel_gpu/plugin/graph.hpp"
 #include "intel_gpu/runtime/itt.hpp"
 #include "intel_gpu/plugin/infer_request.hpp"
@@ -96,11 +97,14 @@ CompiledModel::CompiledModel(std::istream& networkModel, InferenceEngine::Remote
         std::string name;
         std::string precision;
         std::string layout;
+        InferenceEngine::SizeVector dims;
         ib >> name;
         ib >> precision;
         ib >> layout;
+        ib >> dims;

         DataPtr input = std::make_shared<Data>(name, Precision::FromStr(precision), cldnn::serial_util::layout_from_string(layout));
+        input->setDims(dims);
         InputInfo::Ptr infoNew = std::make_shared<InputInfo>();
         infoNew->setInputData(input);
         inputs.emplace(std::make_pair(name, infoNew));
@@ -115,11 +119,14 @@ CompiledModel::CompiledModel(std::istream& networkModel, InferenceEngine::Remote
         std::string name;
         std::string precision;
         std::string layout;
+        InferenceEngine::SizeVector dims;
         ib >> name;
         ib >> precision;
         ib >> layout;
+        ib >> dims;

         DataPtr output = std::make_shared<Data>(name, Precision::FromStr(precision), cldnn::serial_util::layout_from_string(layout));
+        output->setDims(dims);
         outputs.emplace(std::make_pair(name, output));
     }
@@ -317,14 +324,6 @@ IInferRequestInternal::Ptr CompiledModel::CreateInferRequest() {
                                          _callbackExecutor);
 }

-bool CompiledModel::is_serializable() {
-    // Dynamic model serialization is not yet supported.
-    if (m_graphs[0]->GetNetwork()->is_dynamic())
-        return false;
-
-    return true;
-}
-
 // Cache blob format:
 //     [ ConstInputsDataMap / ConstOutputsDataMap ]
 //     [ ov::Node::Input/ ov::Node::Output ]
@@ -334,9 +333,6 @@ void CompiledModel::Export(std::ostream& networkModel) {
     if (m_graphs.empty())
         IE_THROW(NetworkNotLoaded);

-    if (!is_serializable())
-        return;
-
     cldnn::BinaryOutputBuffer ob(networkModel);

     // InputsInfo and OutputsInfo for CNNNetwork
@@ -350,6 +346,7 @@ void CompiledModel::Export(std::ostream& networkModel) {
             std::stringstream ss;
             ss << in.second->getInputData()->getLayout();
             ob << ss.str();
+            ob << in.second->getTensorDesc().getDims();
         }

         ob << GetOutputsInfo().size();
@@ -361,6 +358,7 @@ void CompiledModel::Export(std::ostream& networkModel) {
             std::stringstream ss;
             ss << out.second->getLayout();
             ob << ss.str();
+            ob << out.second->getTensorDesc().getDims();
         }
     }
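To make the record layout visible, here is a standalone round-trip of the per-tensor entry the hunks above write and read (illustration only: the plugin streams through cldnn::BinaryOutputBuffer / BinaryInputBuffer, which are binary, whereas this sketch uses text streams):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct TensorRecord {
    std::string name, precision, layout;
    std::vector<size_t> dims;  // new in this commit: dims travel with the record
};

// Export side: mirrors "ob << name; ob << precision; ob << ss.str(); ob << ...getDims();"
std::ostream& operator<<(std::ostream& os, const TensorRecord& r) {
    os << r.name << ' ' << r.precision << ' ' << r.layout << ' ' << r.dims.size();
    for (size_t d : r.dims) os << ' ' << d;
    return os;
}

// Import side: mirrors "ib >> name; ib >> precision; ib >> layout; ib >> dims;"
std::istream& operator>>(std::istream& is, TensorRecord& r) {
    size_t rank = 0;
    is >> r.name >> r.precision >> r.layout >> rank;
    r.dims.resize(rank);
    for (size_t& d : r.dims) is >> d;
    return is;
}

int main() {
    std::stringstream blob;
    blob << TensorRecord{"input0", "FP32", "NCHW", {1, 3, 224, 224}};

    TensorRecord back;
    blob >> back;
    std::cout << back.name << " rank=" << back.dims.size() << '\n';  // prints: input0 rank=4
    return 0;
}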
@@ -142,12 +142,6 @@ Plugin::Plugin() : m_default_contexts({}) {
             m_default_contexts.insert({device.first, ctx});
         }
     }
-
-    if (const char* env_p = std::getenv("OV_GPU_CACHE_MODEL")) {
-        if (env_p[0] == '1') {
-            isModelCachingEnabled = true;
-        }
-    }
 }

 auto check_inputs = [](InferenceEngine::InputsDataMap _networkInputs) {
@@ -204,6 +198,9 @@ IExecutableNetworkInternal::Ptr Plugin::LoadExeNetworkImpl(const InferenceEngine
     {
         OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::LoadExeNetworkImpl::CreateExeNetwork");
         CompiledModel::Ptr exeNetwork = std::make_shared<CompiledModel>(transformedNetwork, context, config);
+        if (exeNetwork->m_graphs[0]->GetNetwork()->is_dynamic()) {
+            isModelCachingEnabled = false;
+        }
         update_memory_statistics(context->get_impl());
         return exeNetwork;
     }
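Note the two-step design: isModelCachingEnabled now defaults to true (plugin.hpp hunk above) and is cleared once a compiled graph turns out to be dynamic. Whether a network is dynamic is only established after the CompiledModel is built, which is presumably why the check sits in LoadExeNetworkImpl rather than in the plugin constructor.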
@@ -70,10 +70,8 @@ std::string kernels_cache::get_cache_path() const {
 }

 bool kernels_cache::is_cache_enabled() const {
-    if (const char* env_p = std::getenv("OV_GPU_CACHE_MODEL")) {
-        if (env_p[0] == '1') {
-            return false;
-        }
-    }
+    if (!_config.get_property(ov::intel_gpu::allow_new_shape_infer)) {
+        return false;
+    }

     return !_config.get_property(ov::cache_dir).empty();
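Putting the pieces together, the gating after this change reduces to the following (a paraphrasing sketch of the predicates in this diff, not a public API):

#include <string>

// Whole-model (blob) caching: the default path, disabled for dynamic graphs
// (see Plugin::LoadExeNetworkImpl above).
bool model_caching_enabled(bool graph_is_dynamic) {
    return !graph_is_dynamic;
}

// Kernel caching: the fallback path, now keyed off the allow_new_shape_infer
// property instead of the OV_GPU_CACHE_MODEL environment variable
// (see kernels_cache::is_cache_enabled above). Both paths require ov::cache_dir.
bool kernel_caching_enabled(bool allow_new_shape_infer, const std::string& cache_dir) {
    if (!allow_new_shape_infer)
        return false;
    return !cache_dir.empty();
}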