debugging DG1 perf drop (presumably due to the model not fitting into device memory)
This commit is contained in:
parent 5834de7f67
commit b52768c2cc
@@ -303,7 +303,7 @@ InferenceEngine::IInferRequestInternal::Ptr AutoBatchExecutableNetwork::CreateIn
                     t.first->_inferRequest->CopyInputsIfNeeded();
                 }
                 workerRequestPtr->_inferRequest->StartAsync();
-                std::cout << "BATCH" << std::endl;
+                // std::cout << "BATCH" << std::endl;
             } else if ((status == std::cv_status::timeout) && sz) {
                 // timeout to collect the batch is over, have to execute the requests in the batch1 mode
                 auto start = std::chrono::high_resolution_clock::now();
@@ -549,15 +549,27 @@ InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadN
     const auto perfConfig = fullConfig.find(PluginConfigParams::KEY_PERF_COUNT);
     const bool enablePerfCounters = (fullConfig.end() != perfConfig) && (perfConfig->second == PluginConfigParams::YES);

+    auto report_footprint = [](std::shared_ptr<ICore> pCore, std::string device, std::string message) -> size_t {
+        size_t footprint = 0;
+        const auto stats = pCore->GetMetric(device, GPU_METRIC_KEY(MEMORY_STATISTICS)).as<std::map<std::string, uint64_t>>();
+        for (auto s : stats)
+            footprint += s.second;
+        std::cout << "!!!!!!!!!!!!!! (FOOTPRINT) " << message << " : " << footprint / 1024 << " KB" << std::endl;
+        return footprint;
+    };
+
+    if (deviceName.find("GPU") != std::string::npos)
+        report_footprint(GetCore(), deviceName, "Before Batch1");
     auto executableNetworkWithoutBatch = ctx
         ? GetCore()->LoadNetwork(network, ctx, deviceConfig)
         : GetCore()->LoadNetwork(network, deviceName, deviceConfig);
+    if (deviceName.find("GPU") != std::string::npos)
+        report_footprint(GetCore(), deviceName, "After Batch1");
     // device settings + auto-batch settings
     std::unordered_map<std::string, InferenceEngine::Parameter> networkConfig;
     networkConfig.insert(*device_batch);
     networkConfig.insert(deviceConfig.begin(), deviceConfig.end());

+    // TODO: remove this experimental code that loops over ever-smaller batches rather than using the batch1 footprint only
     InferenceEngine::SoExecutableNetworkInternal executableNetworkWithBatch;
     do {
         try {
@@ -582,6 +594,14 @@ InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadN
             executableNetworkWithBatch = ctx
                 ? GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, ctx, deviceConfig)
                 : GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, deviceName, deviceConfig);
+            if (deviceName.find("GPU") != std::string::npos) {
+                const uint64_t total_mem = GetCore()->GetMetric(deviceName, GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE));
+                const size_t footprint = report_footprint(GetCore(), deviceName, "After BATCHED");
+                if (footprint > total_mem) { // WA for inaccurate footprint estimations
+                    std::cout << "!!!! Total on-device mem " << total_mem << " is less than the footprint: " << footprint << std::endl;
+                    throw NETWORK_NOT_LOADED;
+                }
+            }
         } catch (...) {
             // reload the network with a smaller batch
             executableNetworkWithBatch = {nullptr, nullptr};
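Note on the TODO above: the batch1 footprint measured around the first LoadNetwork could bound the batch size directly, instead of the trial-and-error reload loop. A minimal sketch of that idea, assuming the footprint grows roughly linearly with batch size; batch1_cost and estimated_max_batch are illustrative names, not from this commit:

    // Illustrative sketch only (not in this commit): derive a batch bound from the batch1 footprint.
    const size_t before = report_footprint(GetCore(), deviceName, "Before Batch1");
    // ... the batch1 LoadNetwork call happens here ...
    const size_t after = report_footprint(GetCore(), deviceName, "After Batch1");
    const size_t batch1_cost = (after > before) ? (after - before) : after;  // memory attributable to batch 1
    const uint64_t total_mem = GetCore()->GetMetric(deviceName, GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE));
    // keep headroom, since MEMORY_STATISTICS can under-report (hence the WA check above)
    const size_t estimated_max_batch = batch1_cost ? (total_mem / 2) / batch1_cost : 1;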
@@ -250,7 +250,7 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
             }
         } else if (key.compare(PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS) == 0) {
             if (val.compare(PluginConfigParams::GPU_THROUGHPUT_AUTO) == 0) {
-                throughput_streams = 2;
+                throughput_streams = default_num_streams_for_tput;
             } else {
                 int val_i;
                 try {
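For context, this hunk only changes what GPU_THROUGHPUT_AUTO resolves to. A minimal usage sketch, assuming the standard config-map path into Config::UpdateFromMap:

    std::map<std::string, std::string> config = {
        {PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, PluginConfigParams::GPU_THROUGHPUT_AUTO}};
    Config cfg;
    cfg.UpdateFromMap(config);  // throughput_streams now equals default_num_streams_for_tput (2)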
@@ -14,6 +14,9 @@

 namespace CLDNNPlugin {

+// fixme: this value should be deduced from the #command-streamers, and presumably queried from the plugin
+const uint32_t default_num_streams_for_tput = 2;
+
 struct Config {
     Config(std::string device_id = "0") : device_id(device_id),
                                           throughput_streams(1),
@@ -726,6 +726,7 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::st
         std::cout << "SELECTED BATCH: " << batch << std::endl;
         std::map<std::string, InferenceEngine::Parameter> options_for_max_batch;
         options_for_max_batch["MODEL_PTR"] = std::const_pointer_cast<ngraph::Function>(network->getFunction());
+        options_for_max_batch["GPU_THROUGHPUT_STREAMS"] = static_cast<uint32_t>(default_num_streams_for_tput);
         auto max_batch_size = GetMetric(GPU_METRIC_KEY(MAX_BATCH_SIZE), options_for_max_batch).as<unsigned int>();
         std::cout << "MAX_BATCH: " << max_batch_size << std::endl;
         unsigned int closest = pow(2, floor(log(max_batch_size)/log(2)));
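Side note on the power-of-two rounding in the last line above: pow/log on doubles can land just below an exact power of two for large inputs. A hypothetical integer-only equivalent (not part of this commit):

    // Largest power of two <= n, using integer shifts only.
    static unsigned int closest_pow2_below(unsigned int n) {
        if (n == 0)
            return 0;
        unsigned int p = 1;
        while (n >>= 1)
            p <<= 1;
        return p;
    }
    // e.g. closest_pow2_below(max_batch_size) in place of pow(2, floor(log(max_batch_size)/log(2)))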