Debugging DG1 perf drop (presumably caused by the model not fitting into device memory)

This commit is contained in:
myshevts 2021-12-01 18:13:01 +03:00
parent 5834de7f67
commit b52768c2cc
4 changed files with 27 additions and 3 deletions

View File

@ -303,7 +303,7 @@ InferenceEngine::IInferRequestInternal::Ptr AutoBatchExecutableNetwork::CreateIn
t.first->_inferRequest->CopyInputsIfNeeded();
}
workerRequestPtr->_inferRequest->StartAsync();
std::cout << "BATCH" << std::endl;
// std::cout << "BATCH" << std::endl;
} else if ((status == std::cv_status::timeout) && sz) {
// timeout to collect the batch is over, have to execute the requests in the batch1 mode
auto start = std::chrono::high_resolution_clock::now();
@ -549,15 +549,27 @@ InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadN
const auto perfConfig = fullConfig.find(PluginConfigParams::KEY_PERF_COUNT);
const bool enablePerfCounters = (fullConfig.end() != perfConfig) && (perfConfig->second == PluginConfigParams::YES);
auto report_footprint = [] (std::shared_ptr<ICore> pCore, std::string device, std::string message) -> size_t {
size_t footprint = 0;
const auto stats = pCore->GetMetric(device, GPU_METRIC_KEY(MEMORY_STATISTICS)).as<std::map<std::string, uint64_t>>();
for (auto s : stats)
footprint += s.second;
std::cout << "!!!!!!!!!!!!!! (FOOTPRINT) " << message << " : " << footprint/1024 << " MB" << std::endl;
return footprint;
};
if (deviceName.find("GPU") != std::string::npos)
report_footprint(GetCore(), deviceName, "Before Batch1");
auto executableNetworkWithoutBatch = ctx
? GetCore()->LoadNetwork(network, ctx, deviceConfig)
: GetCore()->LoadNetwork(network, deviceName, deviceConfig);
if (deviceName.find("GPU") != std::string::npos)
report_footprint(GetCore(), deviceName, "After Batch1");
// device settings + auto-batch settings
std::unordered_map<std::string, InferenceEngine::Parameter> networkConfig;
networkConfig.insert(*device_batch);
networkConfig.insert(deviceConfig.begin(), deviceConfig.end());
// TODO: remove this experimental code that does loop rather than use the batch1 footprint only
InferenceEngine::SoExecutableNetworkInternal executableNetworkWithBatch;
do {
try {
@ -582,6 +594,14 @@ InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadN
executableNetworkWithBatch = ctx
? GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, ctx, deviceConfig)
: GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, deviceName, deviceConfig);
if (deviceName.find("GPU") != std::string::npos) {
const uint64_t total_mem = GetCore()->GetMetric(deviceName, GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE));
const size_t footprint = report_footprint(GetCore(), deviceName, "After BATCHED");
if (footprint > total_mem) { // WA for inaccurate footprint estimations
std::cout << "!!!! Total on-device mem is " << total_mem << " less than :" << footprint << std::endl;
throw NETWORK_NOT_LOADED;
}
}
} catch (...) {
// reload the network with smaller batch
executableNetworkWithBatch = {nullptr, nullptr};

View File

@ -250,7 +250,7 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
}
} else if (key.compare(PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS) == 0) {
if (val.compare(PluginConfigParams::GPU_THROUGHPUT_AUTO) == 0) {
throughput_streams = 2;
throughput_streams = default_num_streams_for_tput;
} else {
int val_i;
try {

View File

@ -14,6 +14,9 @@
namespace CLDNNPlugin {
// fixme: this value should be deduced from the #command-streamers, and presumably queried from the plugin
const uint32_t default_num_streams_for_tput = 2;
struct Config {
Config(std::string device_id = "0") : device_id(device_id),
throughput_streams(1),

View File

@ -726,6 +726,7 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::st
std::cout << "SELECTED BATCH: " << batch << std::endl;
std::map<std::string, InferenceEngine::Parameter> options_for_max_batch;
options_for_max_batch["MODEL_PTR"] = std::const_pointer_cast<ngraph::Function>(network->getFunction());
options_for_max_batch["GPU_THROUGHPUT_STREAMS"] = static_cast<uint32_t>(default_num_streams_for_tput);
auto max_batch_size = GetMetric(GPU_METRIC_KEY(MAX_BATCH_SIZE), options_for_max_batch).as<unsigned int>();
std::cout << "MAX_BATCH: " << max_batch_size << std::endl;
unsigned int closest = pow(2, floor(log(max_batch_size)/log(2)));