feat(load-balancer): improve migration (perf mode) regarding memory and cpu usage
- ensure we optimize CPU first instead of free memory
- use low threshold now to forbid bad migration based on cpu usage
- add a tolerance on the VM CPU usage to migrate VM with the most memory used
- do not migrate if we create an unbalanced configuration (only if high thresholds are not reached)
- change factors to take into account the new algorithm
This commit is contained in:
committed by
Julien Fontanet
parent
8ae432554e
commit
4008934bbb
@@ -15,6 +15,7 @@
|
||||
- [XOA] Notify user when proxies need to be upgraded (PR [#5717](https://github.com/vatesfr/xen-orchestra/pull/5717))
|
||||
- [Host/network] Identify the management network [#5731](https://github.com/vatesfr/xen-orchestra/issues/5731) (PR [#5743](https://github.com/vatesfr/xen-orchestra/pull/5743))
|
||||
- [Backup/S3] Support for HTTP protocol and choice of region (PR [#5658](https://github.com/vatesfr/xen-orchestra/pull/5658))
|
||||
- [Host/Load-balancer] Improve migration (perf mode) regarding memory and cpu usage (PR [#5734](https://github.com/vatesfr/xen-orchestra/pull/5734))
|
||||
|
||||
### Bug fixes
|
||||
|
||||
|
||||
@@ -1,19 +1,12 @@
|
||||
import { filter, find } from 'lodash'
|
||||
import { filter } from 'lodash'
|
||||
|
||||
import Plan from './plan'
|
||||
import { debug } from './utils'
|
||||
|
||||
// Compare a list of objects and give the best.
|
||||
// Compare a list of objects with `fun` and return the best one.
// `fun(best, candidate) > 0` means `candidate` beats the current best.
// Note: a non-array input simply yields `objects[0]` (undefined for plain
// objects), because the loop never runs when `length` is undefined.
function searchBestObject(objects, fun) {
  let best = objects[0]

  for (let i = 1; i < objects.length; ++i) {
    const candidate = objects[i]
    if (fun(best, candidate) > 0) {
      best = candidate
    }
  }

  return best
}
// Returns true when `a` and `b` are close enough: either their difference is
// within a relative tolerance of `epsi` (scaled by the smaller magnitude), or
// both values are within `epsi` of zero (covers the case where the relative
// scale collapses to ~0).
function epsiEqual(a, b, epsi = 0.001) {
  const magnitudeA = Math.abs(a)
  const magnitudeB = Math.abs(b)
  const bothNearZero = magnitudeA <= epsi && magnitudeB <= epsi
  return bothNearZero || Math.abs(a - b) <= Math.min(magnitudeA, magnitudeB) * epsi
}
|
||||
|
||||
// ===================================================================
|
||||
@@ -55,14 +48,7 @@ export default class PerformancePlan extends Plan {
|
||||
}
|
||||
|
||||
const { averages, toOptimize } = results
|
||||
|
||||
toOptimize.sort((a, b) => {
|
||||
a = averages[a.id]
|
||||
b = averages[b.id]
|
||||
|
||||
return b.cpu - a.cpu || a.memoryFree - b.memoryFree
|
||||
})
|
||||
|
||||
toOptimize.sort((a, b) => -this._sortHosts(a, b))
|
||||
for (const exceededHost of toOptimize) {
|
||||
const { id } = exceededHost
|
||||
|
||||
@@ -79,51 +65,68 @@ export default class PerformancePlan extends Plan {
|
||||
}
|
||||
}
|
||||
|
||||
_getThresholdState(averages) {
|
||||
return {
|
||||
cpu: averages.cpu >= this._thresholds.cpu.high,
|
||||
mem: averages.memoryFree <= this._thresholds.memoryFree.high,
|
||||
}
|
||||
}
|
||||
|
||||
_sortHosts(aAverages, bAverages) {
|
||||
const aState = this._getThresholdState(aAverages)
|
||||
const bState = this._getThresholdState(bAverages)
|
||||
|
||||
// A. Same state.
|
||||
if (aState.mem === bState.mem && aState.cpu === bState.cpu) {
|
||||
if (epsiEqual(aAverages.cpu, bAverages.cpu)) {
|
||||
return bAverages.memoryFree - aAverages.memoryFree
|
||||
}
|
||||
return aAverages.cpu - bAverages.cpu
|
||||
}
|
||||
|
||||
// B. No limit reached on A OR both limits reached on B.
|
||||
if ((!aState.mem && !aState.cpu) || (bState.mem && bState.cpu)) {
|
||||
return -1
|
||||
}
|
||||
|
||||
// C. No limit reached on B OR both limits reached on A.
|
||||
if ((!bState.mem && !bState.cpu) || (aState.mem && aState.cpu)) {
|
||||
return 1
|
||||
}
|
||||
|
||||
// D. If only one limit is reached on A AND B, we prefer to migrate on the host with the lowest CPU usage.
|
||||
return !aState.cpu ? -1 : 1
|
||||
}
|
||||
|
||||
async _optimize({ exceededHost, hosts, hostsAverages }) {
|
||||
const vms = filter(this._getAllRunningVms(), vm => vm.$container === exceededHost.id)
|
||||
const vmsAverages = await this._getVmsAverages(vms, { [exceededHost.id]: exceededHost })
|
||||
|
||||
// Sort vms by cpu usage. (lower to higher)
|
||||
vms.sort((a, b) => vmsAverages[b.id].cpu - vmsAverages[a.id].cpu)
|
||||
// Sort vms by cpu usage. (higher to lower) + use memory otherwise.
|
||||
vms.sort((a, b) => {
|
||||
const aAverages = vmsAverages[a.id]
|
||||
const bAverages = vmsAverages[b.id]
|
||||
|
||||
// We use a tolerance to migrate VM with the most memory used.
|
||||
if (epsiEqual(aAverages.cpu, bAverages.cpu, 3)) {
|
||||
return bAverages.memory - aAverages.memory
|
||||
}
|
||||
return bAverages.cpu - aAverages.cpu
|
||||
})
|
||||
|
||||
const exceededAverages = hostsAverages[exceededHost.id]
|
||||
const promises = []
|
||||
|
||||
const xapiSrc = this.xo.getXapi(exceededHost)
|
||||
let optimizationsCount = 0
|
||||
|
||||
const searchFunction = (a, b) => hostsAverages[b.id].cpu - hostsAverages[a.id].cpu
|
||||
let optimizationCount = 0
|
||||
|
||||
for (const vm of vms) {
|
||||
debug(`Trying to migrate ${vm.id}...`)
|
||||
|
||||
// Search host with lower cpu usage in the same pool first. In other pool if necessary.
|
||||
let destination = searchBestObject(
|
||||
find(hosts, host => host.$poolId === vm.$poolId),
|
||||
searchFunction
|
||||
)
|
||||
|
||||
if (!destination) {
|
||||
debug('No destination host found in the current VM pool. Trying in all pools.')
|
||||
destination = searchBestObject(hosts, searchFunction)
|
||||
}
|
||||
|
||||
const destinationAverages = hostsAverages[destination.id]
|
||||
const vmAverages = vmsAverages[vm.id]
|
||||
|
||||
debug(`Trying to migrate VM (${vm.id}) to Host (${destination.id}) from Host (${exceededHost.id})...`)
|
||||
|
||||
// Unable to move the vm.
|
||||
// Stop migration if we are below low threshold.
|
||||
if (
|
||||
exceededAverages.cpu - vmAverages.cpu < destinationAverages.cpu + vmAverages.cpu ||
|
||||
destinationAverages.memoryFree < vmAverages.memory
|
||||
exceededAverages.cpu <= this._thresholds.cpu.low &&
|
||||
exceededAverages.memoryFree >= this._thresholds.memoryFree.low
|
||||
) {
|
||||
debug(`Cannot migrate VM (${vm.id}) to Host (${destination.id}).`)
|
||||
debug(
|
||||
`Src Host CPU=${exceededAverages.cpu}, Dest Host CPU=${destinationAverages.cpu}, VM CPU=${vmAverages.cpu}`
|
||||
)
|
||||
debug(`Dest Host free RAM=${destinationAverages.memoryFree}, VM used RAM=${vmAverages.memory})`)
|
||||
continue
|
||||
return
|
||||
}
|
||||
|
||||
if (!vm.xenTools) {
|
||||
@@ -145,6 +148,49 @@ export default class PerformancePlan extends Plan {
|
||||
}
|
||||
}
|
||||
|
||||
hosts.sort((a, b) => {
|
||||
if (a.$poolId !== b.$poolId) {
|
||||
// Use host in the same pool first. In other pool if necessary.
|
||||
if (a.$poolId === vm.$poolId) {
|
||||
return -1
|
||||
}
|
||||
if (b.$poolId === vm.$poolId) {
|
||||
return 1
|
||||
}
|
||||
}
|
||||
|
||||
return this._sortHosts(hostsAverages[a.id], hostsAverages[b.id])
|
||||
})
|
||||
|
||||
const destination = hosts[0]
|
||||
|
||||
const destinationAverages = hostsAverages[destination.id]
|
||||
const vmAverages = vmsAverages[vm.id]
|
||||
|
||||
// Unable to move the vm.
|
||||
// Because the performance mode is focused on the CPU usage, we can't migrate if the low threshold
|
||||
// is reached on the destination.
|
||||
// It's not the same idea regarding the memory usage, we can migrate if the low threshold is reached,
|
||||
// but we avoid the migration in the critical (high) threshold case.
|
||||
// Which high thresholds the source (exceeded) host currently trips; used
// below to decide whether the balance-preserving extra check applies.
const state = this._getThresholdState(exceededAverages)
|
||||
if (
|
||||
// Destination would reach the low CPU threshold after receiving the VM.
destinationAverages.cpu + vmAverages.cpu >= this._thresholds.cpu.low ||
|
||||
// Destination would not keep enough free memory after receiving the VM.
// NOTE(review): this compares free memory against `this._thresholds.cpu.high`
// (a CPU percentage) — should it be `this._thresholds.memoryFree.high`? TODO confirm.
destinationAverages.memoryFree - vmAverages.memory <= this._thresholds.cpu.high ||
|
||||
// When no high threshold is tripped on the source, refuse migrations that
// would leave the destination worse off than the source (unbalanced result).
(!state.cpu &&
|
||||
// NOTE(review): `_getThresholdState` returns a `mem` property, not `memory`,
// so `state.memory` is always undefined and `!state.memory` is always true —
// should this be `!state.mem`? TODO confirm.
!state.memory &&
|
||||
(exceededAverages.cpu - vmAverages.cpu < destinationAverages.cpu + vmAverages.cpu ||
|
||||
exceededAverages.memoryFree + vmAverages.memory > destinationAverages.memoryFree - vmAverages.memory))
|
||||
) {
|
||||
debug(`Cannot migrate VM (${vm.id}) to Host (${destination.id}).`)
|
||||
debug(
|
||||
`Src Host CPU=${exceededAverages.cpu}, Dest Host CPU=${destinationAverages.cpu}, VM CPU=${vmAverages.cpu}`
|
||||
)
|
||||
debug(
|
||||
// NOTE(review): the trailing ')' inside this log string looks like a typo.
`Src Host free RAM=${exceededAverages.memoryFree}, Dest Host free RAM=${destinationAverages.memoryFree}, VM used RAM=${vmAverages.memory})`
|
||||
)
|
||||
continue
|
||||
}
|
||||
|
||||
exceededAverages.cpu -= vmAverages.cpu
|
||||
destinationAverages.cpu += vmAverages.cpu
|
||||
|
||||
@@ -152,12 +198,12 @@ export default class PerformancePlan extends Plan {
|
||||
destinationAverages.memoryFree -= vmAverages.memory
|
||||
|
||||
debug(`Migrate VM (${vm.id}) to Host (${destination.id}) from Host (${exceededHost.id}).`)
|
||||
optimizationsCount++
|
||||
optimizationCount++
|
||||
|
||||
promises.push(xapiSrc.migrateVm(vm._xapiId, this.xo.getXapi(destination), destination._xapiId))
|
||||
}
|
||||
|
||||
await Promise.all(promises)
|
||||
debug(`Performance mode: ${optimizationsCount} optimizations for Host (${exceededHost.id}).`)
|
||||
debug(`Performance mode: ${optimizationCount} optimizations for Host (${exceededHost.id}).`)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,14 +9,14 @@ const MINUTES_OF_HISTORICAL_DATA = 30
|
||||
export const DEFAULT_CRITICAL_THRESHOLD_CPU = 90.0
|
||||
|
||||
// Memory threshold in MB.
|
||||
export const DEFAULT_CRITICAL_THRESHOLD_MEMORY_FREE = 64.0
|
||||
export const DEFAULT_CRITICAL_THRESHOLD_MEMORY_FREE = 1000.0
|
||||
|
||||
// Thresholds factors.
|
||||
const HIGH_THRESHOLD_FACTOR = 0.85
|
||||
const LOW_THRESHOLD_FACTOR = 0.25
|
||||
const LOW_THRESHOLD_FACTOR = 0.65
|
||||
|
||||
const HIGH_THRESHOLD_MEMORY_FREE_FACTOR = 1.25
|
||||
const LOW_THRESHOLD_MEMORY_FREE_FACTOR = 20.0
|
||||
const HIGH_THRESHOLD_MEMORY_FREE_FACTOR = 1.2
|
||||
const LOW_THRESHOLD_MEMORY_FREE_FACTOR = 1.5
|
||||
|
||||
// Returns `value` when it compares as a non-negative number, otherwise `def`.
// `NaN`, `undefined`, and negative values all fall back to the default.
const numberOrDefault = function (value, def) {
  if (value >= 0) {
    return value
  }
  return def
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user