Experimenting with DG1 on batch size selection; also collecting the memory footprint.
parent e7b743ac33
commit ac21d71321
@@ -681,13 +681,11 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::st
     } else if (name == METRIC_KEY(OPTIMAL_BATCH)) {
         auto network = options.find("MODEL_ADDRESS")->second.as<InferenceEngine::CNNNetwork const*>();
         auto networkCloned = CloneAndTransformNetwork(*network, _impl->m_config);
-        // i7_1185G7
-        const float L2_cache_size = 6*1024*1024;
+        // DG1
         const float L3_cache_size = 12*1024*1024;
         unsigned int batch = 1;
         ov::MemBandwidthPressure memPressure = ov::MemBandwidthPressureTolerance(
-            networkCloned.getFunction(),
-            L2_cache_size, L3_cache_size);
+            networkCloned.getFunction(), L3_cache_size);
         if (memPressure.max_mem_tolerance > 8*ov::MemBandwidthPressure::LIMITED) {
             batch = 32;
         } else if (memPressure.max_mem_tolerance > 4*ov::MemBandwidthPressure::LIMITED) {
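For context, a minimal standalone sketch of the batch-selection heuristic this hunk implements: the measured memory-bandwidth tolerance is compared against multiples of the LIMITED constant, and the reported batch grows with the headroom. The LIMITED value and the batch picked in the truncated else-if branch are illustrative assumptions, not taken from the patch.

    // Illustrative sketch only; mirrors the shape of the hunk's thresholding.
    #include <iostream>

    namespace ov {
    struct MemBandwidthPressure {
        static constexpr float LIMITED = 1.0f;  // assumed value, for illustration
        float max_mem_tolerance = 0.0f;
    };
    }  // namespace ov

    // Hypothetical helper: maps bandwidth tolerance to an optimal batch size.
    unsigned int pickOptimalBatch(const ov::MemBandwidthPressure& memPressure) {
        unsigned int batch = 1;
        if (memPressure.max_mem_tolerance > 8 * ov::MemBandwidthPressure::LIMITED) {
            batch = 32;
        } else if (memPressure.max_mem_tolerance > 4 * ov::MemBandwidthPressure::LIMITED) {
            batch = 16;  // the hunk truncates here; 16 is an assumed middle step
        }
        return batch;
    }

    int main() {
        ov::MemBandwidthPressure p{/*max_mem_tolerance=*/9.0f};
        std::cout << pickOptimalBatch(p) << std::endl;  // prints 32
        return 0;
    }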
@@ -1150,11 +1150,12 @@ ExecutableNetwork Core::LoadNetwork(const CNNNetwork& network,
                                     const std::string& deviceNameOrig,
                                     const std::map<std::string, std::string>& config) {
     auto deviceName = deviceNameOrig;
-    if (deviceNameOrig == "GPU") {
+    if (deviceNameOrig.find("GPU") != std::string::npos) {
         std::map<std::string, Parameter> options;
         options["MODEL_ADDRESS"] = &network;
         auto optimalBatchSize =
-            _impl->GetCPPPluginByName(deviceNameOrig).get_metric(METRIC_KEY(OPTIMAL_BATCH), options).as<unsigned int>();
+            _impl->GetCPPPluginByName(DeviceIDParser(deviceName).getDeviceName()).
+                get_metric(METRIC_KEY(OPTIMAL_BATCH), options).as<unsigned int>();
         auto function = network.getFunction();
         bool bDetectionOutput = false;
         for (auto&& node : function->get_ops()) {
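The hunk cuts off inside the loop over the function's ops; a plausible continuation, shown here as a hypothetical helper rather than the actual patch, looks for a DetectionOutput op, a typical reason to exclude a network from automatic batching.

    #include <memory>
    #include <string>
    #include <ngraph/function.hpp>

    // Hypothetical helper (not from the patch): returns true if the graph
    // contains a DetectionOutput op, mirroring the bDetectionOutput flag above.
    bool hasDetectionOutput(const std::shared_ptr<ngraph::Function>& function) {
        for (auto&& node : function->get_ops()) {
            if (std::string(node->get_type_name()) == "DetectionOutput")
                return true;
        }
        return false;
    }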
@@ -500,11 +500,9 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
     // the more "capable" the CPU in general, the more streams we may want to keep to keep it utilized
     const float memThresholdAssumeLimitedForISA = ov::MemBandwidthPressure::LIMITED/isaSpecificThreshold;
     const float L2_cache_size = mkldnn::utils::get_cache_size(2 /*level*/, true /*per core */);
-    const float L3_cache_size = mkldnn::utils::get_cache_size(3, false);
     ov::MemBandwidthPressure networkToleranceForLowCache = ov::MemBandwidthPressureTolerance(
         clonedNetwork.getFunction(),
-        L2_cache_size, L3_cache_size,
-        memThresholdAssumeLimitedForISA);
+        L2_cache_size, memThresholdAssumeLimitedForISA);
     // num of phys CPU cores (most aggressive value for #streams)
     const auto num_cores = getNumberOfCPUCores();
     // less aggressive
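To make the first lines of this hunk concrete: the assume-limited threshold scales inversely with the ISA-specific divisor, so a more capable ISA treats a layer as bandwidth-bound at a lower tolerance. The sketch below uses assumed values for LIMITED and the divisors.

    #include <cstdio>
    #include <initializer_list>

    // Assumed constant for illustration; the real value lives in MemBandwidthPressure.
    constexpr float LIMITED = 1.0f;

    int main() {
        // Mirrors the hunk's arithmetic: threshold = LIMITED / isaSpecificThreshold.
        for (float isaSpecificThreshold : {1.0f, 2.0f, 4.0f}) {
            const float memThresholdAssumeLimitedForISA = LIMITED / isaSpecificThreshold;
            std::printf("divisor %.1f -> assume limited below %.2f\n",
                        isaSpecificThreshold, memThresholdAssumeLimitedForISA);
        }
        return 0;
    }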
@@ -23,13 +23,12 @@ struct MemBandwidthPressure {

 MemBandwidthPressure MemBandwidthPressureTolerance(
     const std::shared_ptr<ngraph::Function> nGraphFunc,
-    const float L2_cache_size,
-    const float L3_cache_size,
+    const float cache_size,
     const float memThresholdAssumeLimited = MemBandwidthPressure::LIMITED) {
     int total_convs = 0, mem_limited_convs = 0, compute_convs = 0, total_gemms = 0, mem_limited_gemms = 0,
         total_deconvs = 0, compute_deconvs = 0, mem_limited_deconvs = 0;
     auto memLimitedFactor = [&](int size_data_moved, int datatype_size = 4) -> float {
-        return (L2_cache_size * 1.0f /*util factor, tbd */
+        return (cache_size * 1.0f /*util factor, tbd */
                 / (size_data_moved * datatype_size));
     };
     auto isLowPrecision = [&](ngraph::element::Type type) -> bool {
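A worked example of the renamed memLimitedFactor lambda, with an assumed 6 MB cache: an op that streams 4M fp32 values scores 6*1024*1024 / (4*1024*1024 * 4) = 0.375, below a LIMITED of 1.0, so it would be counted as memory-bandwidth-limited.

    #include <cstdio>

    int main() {
        // Assumed cache size for illustration (e.g. the CPU path's per-core L2).
        const float cache_size = 6 * 1024 * 1024;
        // Same shape as the lambda in the hunk: cache bytes over bytes moved.
        auto memLimitedFactor = [&](int size_data_moved, int datatype_size = 4) -> float {
            return cache_size * 1.0f / (size_data_moved * datatype_size);
        };
        // An op streaming 4M fp32 elements: factor = 0.375, below LIMITED (1.0).
        std::printf("factor = %.3f\n", memLimitedFactor(4 * 1024 * 1024));
        return 0;
    }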