feat(load-balancer): add new anti-affinity mode (#5652)

Fixes #5600
2021-03-30 17:25:41 +02:00
parent efdd196441
commit 9ef05b8afe
6 changed files with 365 additions and 40 deletions
--- a/CHANGELOG.unreleased.md
+++ b/CHANGELOG.unreleased.md
@@ -5,6 +5,8 @@

 ### Enhancements

+- [Host/Load-balancer] Add a new anti-affinity mode (PR [#5652](https://github.com/vatesfr/xen-orchestra/pull/5652))
+
 ### Bug fixes

 ### Packages to release
@@ -26,5 +28,6 @@

 - @xen-orchestra/xapi minor
 - @xen-orchestra/backups minor
+- xo-server-load-balancer minor
 - xo-server patch
 - xo-web minor
--- a/packages/xo-server-load-balancer/src/density-plan.js
+++ b/packages/xo-server-load-balancer/src/density-plan.js
@@ -15,13 +15,19 @@ export default class DensityPlan extends Plan {
  }

  async execute() {
-    const results = await this._findHostsToOptimize()
+    await this._processAntiAffinity()
+
+    const hosts = this._getHosts()
+    const results = await this._getHostStatsAverages({
+      hosts,
+      toOptimizeOnly: true,
+    })

    if (!results) {
      return
    }

-    const { hosts, toOptimize } = results
+    const { toOptimize } = results

    let { averages: hostsAverages } = results

@@ -94,14 +100,26 @@ export default class DensityPlan extends Plan {

    debug(`Try to optimize Host (${hostId}).`)

-    const vms = await this._getVms(hostId)
-    const vmsAverages = await this._getVmsAverages(vms, host)
+    const vms = filter(this._getAllRunningVms(), vm => vm.$container === hostId)
+    const vmsAverages = await this._getVmsAverages(vms, { [host.id]: host })

    for (const vm of vms) {
      if (!vm.xenTools) {
        debug(`VM (${vm.id}) of Host (${hostId}) does not support pool migration.`)
        return
      }
+
+      for (const tag of vm.tags) {
+        // TODO: Improve this piece of code. We could compute variance to check if the VM
+        // is migratable. But the code must be rewritten:
+        // - All VMs, hosts and stats must be fetched at one place.
+        // - It's necessary to maintain a dictionary of tags for each host.
+        // - ...
+        if (this._antiAffinityTags.includes(tag)) {
+          debug(`VM (${vm.id}) of Host (${hostId}) cannot be migrated. It contains anti-affinity tag '${tag}'.`)
+          return
+        }
+      }
    }

    // Sort vms by amount of memory. (+ -> -)
--- a/packages/xo-server-load-balancer/src/index.js
+++ b/packages/xo-server-load-balancer/src/index.js
@@ -3,6 +3,7 @@ import { intersection, uniq } from 'lodash'

 import DensityPlan from './density-plan'
 import PerformancePlan from './performance-plan'
+import SimplePlan from './simple-plan'
 import { DEFAULT_CRITICAL_THRESHOLD_CPU, DEFAULT_CRITICAL_THRESHOLD_MEMORY_FREE } from './plan'
 import { EXECUTION_DELAY, debug } from './utils'

@@ -33,7 +34,7 @@ export const configurationSchema = {
          },

          mode: {
-            enum: ['Performance mode', 'Density mode'],
+            enum: ['Performance mode', 'Density mode', 'Simple mode'],
            title: 'Mode',
          },

@@ -75,6 +76,17 @@ export const configurationSchema = {
              $type: 'Host',
            },
          },
+
+          antiAffinityTags: {
+            type: 'array',
+            title: 'Anti-affinity tags',
+            description: 'list of VM tags to force place VMs on different hosts',
+
+            items: {
+              type: 'string',
+              $type: 'Tag',
+            },
+          },
        },

        required: ['name', 'mode', 'pools'],
@@ -130,11 +142,15 @@ class LoadBalancerPlugin {
    }

    this._poolIds = this._poolIds.concat(pools)
-    this._plans.push(
-      mode === PERFORMANCE_MODE
-        ? new PerformancePlan(this.xo, name, pools, options)
-        : new DensityPlan(this.xo, name, pools, options)
-    )
+    let plan
+    if (mode === PERFORMANCE_MODE) {
+      plan = new PerformancePlan(this.xo, name, pools, options)
+    } else if (mode === DENSITY_MODE) {
+      plan = new DensityPlan(this.xo, name, pools, options)
+    } else {
+      plan = new SimplePlan(this.xo, name, pools, options)
+    }
+    this._plans.push(plan)
  }

  _executePlans() {
--- a/packages/xo-server-load-balancer/src/performance-plan.js
+++ b/packages/xo-server-load-balancer/src/performance-plan.js
@@ -42,14 +42,19 @@ export default class PerformancePlan extends Plan {
      console.error(error)
    }

-    const results = await this._findHostsToOptimize()
+    await this._processAntiAffinity()
+
+    const hosts = this._getHosts()
+    const results = await this._getHostStatsAverages({
+      hosts,
+      toOptimizeOnly: true,
+    })

    if (!results) {
      return
    }

    const { averages, toOptimize } = results
-    const { hosts } = results

    toOptimize.sort((a, b) => {
      a = averages[a.id]
@@ -75,8 +80,8 @@ export default class PerformancePlan extends Plan {
  }

  async _optimize({ exceededHost, hosts, hostsAverages }) {
-    const vms = await this._getVms(exceededHost.id)
-    const vmsAverages = await this._getVmsAverages(vms, exceededHost)
+    const vms = filter(this._getAllRunningVms(), vm => vm.$container === exceededHost.id)
+    const vmsAverages = await this._getVmsAverages(vms, { [exceededHost.id]: exceededHost })

    // Sort vms by cpu usage. (lower to higher)
    vms.sort((a, b) => vmsAverages[b.id].cpu - vmsAverages[a.id].cpu)
@@ -121,6 +126,25 @@ export default class PerformancePlan extends Plan {
        continue
      }

+      if (!vm.xenTools) {
+        debug(`VM (${vm.id}) of Host (${exceededHost.id}) does not support pool migration.`)
+        continue
+      }
+
+      // TODO: Improve this piece of code. We could compute variance to check if the VM
+      // is migratable. But the code must be rewritten:
+      // - All VMs, hosts and stats must be fetched at one place.
+      // - It's necessary to maintain a dictionary of tags for each host.
+      // - ...
+      // The lookup is not done in a nested loop: `continue` must skip the VM, not a tag.
+      const antiAffinityTag = vm.tags.find(tag => this._antiAffinityTags.includes(tag))
+      if (antiAffinityTag !== undefined) {
+        debug(
+          `VM (${vm.id}) of Host (${exceededHost.id}) cannot be migrated. It contains anti-affinity tag '${antiAffinityTag}'.`
+        )
+        continue
+      }
+
      exceededAverages.cpu -= vmAverages.cpu
      destinationAverages.cpu += vmAverages.cpu

--- a/packages/xo-server-load-balancer/src/plan.js
+++ b/packages/xo-server-load-balancer/src/plan.js
@@ -1,4 +1,5 @@
-import { filter, includes, map as mapToArray, size } from 'lodash'
+import { filter, groupBy, includes, isEmpty, keyBy, map as mapToArray, maxBy, minBy, size, sortBy } from 'lodash'
+import { inspect } from 'util'

 import { EXECUTION_DELAY, debug } from './utils'

@@ -94,7 +95,7 @@ function setRealCpuAverageOfVms(vms, vmsAverages, nCpus) {
 // ===================================================================

 export default class Plan {
-  constructor(xo, name, poolIds, { excludedHosts, thresholds } = {}) {
+  constructor(xo, name, poolIds, { excludedHosts, thresholds, antiAffinityTags } = {}) {
    this.xo = xo
    this._name = name
    this._poolIds = poolIds
@@ -107,6 +108,7 @@ export default class Plan {
        critical: numberOrDefault(thresholds && thresholds.memoryFree, DEFAULT_CRITICAL_THRESHOLD_MEMORY_FREE) * 1024,
      },
    }
+    this._antiAffinityTags = antiAffinityTags == null ? [] : antiAffinityTags // the option is optional: no tags means nothing to balance

    for (const key in this._thresholds) {
      const attr = this._thresholds[key]
@@ -130,36 +132,35 @@ export default class Plan {
  // Get hosts to optimize.
  // ===================================================================

-  async _findHostsToOptimize() {
-    const hosts = this._getHosts()
+  async _getHostStatsAverages({ hosts, toOptimizeOnly = false }) {
    const hostsStats = await this._getHostsStats(hosts, 'minutes')

-    // Check if a resource's utilization exceeds threshold.
    const avgNow = computeResourcesAverage(hosts, hostsStats, EXECUTION_DELAY)
-    let toOptimize = this._checkResourcesThresholds(hosts, avgNow)
-
-    // No resource's utilization problem.
-    if (toOptimize.length === 0) {
-      debug('No hosts to optimize.')
-      return
+    let toOptimize
+    if (toOptimizeOnly) {
+      // Check if a resource utilization exceeds threshold.
+      toOptimize = this._checkResourcesThresholds(hosts, avgNow)
+      if (toOptimize.length === 0) {
+        debug('No hosts to optimize.')
+        return
+      }
    }

-    // Check in the last 30 min interval with ratio.
    const avgBefore = computeResourcesAverage(hosts, hostsStats, MINUTES_OF_HISTORICAL_DATA)
    const avgWithRatio = computeResourcesAverageWithWeight(avgNow, avgBefore, 0.75)

-    toOptimize = this._checkResourcesThresholds(toOptimize, avgWithRatio)
-
-    // No resource's utilization problem.
-    if (toOptimize.length === 0) {
-      debug('No hosts to optimize.')
-      return
+    if (toOptimizeOnly) {
+      // Check in the last 30 min interval with ratio.
+      toOptimize = this._checkResourcesThresholds(toOptimize, avgWithRatio)
+      if (toOptimize.length === 0) {
+        debug('No hosts to optimize.')
+        return
+      }
    }

    return {
      toOptimize,
      averages: avgWithRatio,
-      hosts,
    }
  }

@@ -197,11 +198,8 @@ export default class Plan {
    )
  }

-  async _getVms(hostId) {
-    return filter(
-      this.xo.getObjects(),
-      object => object.type === 'VM' && object.power_state === 'Running' && object.$container === hostId
-    )
+  _getAllRunningVms() {
+    return filter(this.xo.getObjects(), object => object.type === 'VM' && object.power_state === 'Running')
  }

  // ===================================================================
@@ -244,7 +242,7 @@ export default class Plan {
    return vmsStats
  }

-  async _getVmsAverages(vms, host) {
+  async _getVmsAverages(vms, hosts) {
    const vmsStats = await this._getVmsStats(vms, 'minutes')
    const vmsAverages = computeResourcesAverageWithWeight(
      computeResourcesAverage(vms, vmsStats, EXECUTION_DELAY),
@@ -253,8 +251,265 @@ export default class Plan {
    )

    // Compute real CPU usage. Virtuals cpus to reals cpus.
-    setRealCpuAverageOfVms(vms, vmsAverages, host.CPUs.cpu_count)
+    for (const [hostId, hostVms] of Object.entries(groupBy(vms, '$container'))) {
+      setRealCpuAverageOfVms(hostVms, vmsAverages, hosts[hostId].CPUs.cpu_count)
+    }

    return vmsAverages
  }
+
+  // ===================================================================
+  // Anti-affinity helpers
+  // ===================================================================
+
+  async _processAntiAffinity() {
+    // Spreads the running VMs carrying each anti-affinity tag over the hosts returned by `_getHosts()`.
+    if (!this._antiAffinityTags.length) {
+      return
+    }
+
+    const allHosts = await this._getHosts()
+    if (allHosts.length <= 1) {
+      return
+    }
+    const idToHost = keyBy(allHosts, 'id')
+    const allVms = filter(this._getAllRunningVms(), vm => vm.$container in idToHost)
+    const taggedHosts = this._getAntiAffinityTaggedHosts(allHosts, allVms)
+
+    // 1. Check if we must migrate VMs: a tag qualifies when two hosts differ by more than one VM carrying it.
+    const tagsDiff = {}
+    for (const watchedTag of this._antiAffinityTags) {
+      const getCount = fn => fn(taggedHosts.hosts, host => host.tags[watchedTag]).tags[watchedTag]
+      const diff = getCount(maxBy) - getCount(minBy)
+      if (diff > 1) {
+        tagsDiff[watchedTag] = diff - 1
+      }
+    }
+
+    if (isEmpty(tagsDiff)) {
+      return
+    }
+
+    // 2. Migrate! Only the keys of `tagsDiff` are used below, its values are just logged.
+    debug('Try to apply anti-affinity policy.')
+    debug(`VM tag count per host: ${inspect(taggedHosts, { depth: null })}.`)
+    debug(`Tags diff: ${inspect(tagsDiff, { depth: null })}.`)
+
+    const vmsAverages = await this._getVmsAverages(allVms, idToHost)
+    const { averages: hostsAverages } = await this._getHostStatsAverages({ hosts: allHosts })
+
+    debug(`Hosts averages: ${inspect(hostsAverages, { depth: null })}.`)
+
+    const promises = []
+    for (const tag in tagsDiff) {
+      promises.push(...this._processAntiAffinityTag({ tag, vmsAverages, hostsAverages, taggedHosts, idToHost }))
+    }
+
+    // 3. Done! The counters were updated while planning, so this logs the expected state: migrations may still run.
+    debug(`VM tag count per host after migration: ${inspect(taggedHosts, { depth: null })}.`)
+    return Promise.all(promises)
+  }
+
+  _processAntiAffinityTag({ tag, vmsAverages, hostsAverages, taggedHosts, idToHost }) {
+    // Plans and starts the migrations balancing `tag`; returns the array of `migrateVm()` promises.
+    const promises = []
+    // Each pass plans at most one migration, the `return` statements below are the only exits.
+    while (true) {
+      // 1. Find source host from which to migrate: it must run several VMs carrying the tag.
+      const sources = sortBy(
+        filter(taggedHosts.hosts, host => host.tags[tag] > 1),
+        [
+          host => host.tags[tag],
+          // Find host with the most memory used. Don't forget the "-". ;)
+          host => -hostsAverages[host.id].memoryFree,
+        ]
+      )
+      // The list is read backwards: highest tag count first, then least free memory.
+      for (let sourceIndex = sources.length; sourceIndex >= 0; --sourceIndex) {
+        if (sourceIndex === 0) {
+          return promises // Nothing to migrate or we can't.
+        }
+
+        const sourceHost = sources[sourceIndex - 1]
+
+        // 2. Find destination host: it must run at least two VMs fewer than the source for this tag.
+        const destinations = sortBy(
+          filter(taggedHosts.hosts, host => host.id !== sourceHost.id && host.tags[tag] + 1 < sourceHost.tags[tag]),
+          [
+            host => host.tags[tag],
+            // Ideally it would be interesting to migrate in the same pool (`false` sorts before `true`).
+            host => host.poolId !== sourceHost.poolId,
+            // Find host with the least memory used. Don't forget the "-". ;)
+            host => -hostsAverages[host.id].memoryFree,
+          ]
+        )
+        if (!destinations.length) {
+          return promises // Cannot find a valid destination.
+        }
+
+        // Build VM list to migrate: only the VMs of the source host carrying the targeted tag.
+        const sourceVms = filter(sourceHost.vms, vm => vm.tags.includes(tag))
+
+        let destinationHost
+        let vm
+        for (const destination of destinations) {
+          destinationHost = destination
+          debug(`Host candidate: ${sourceHost.id} -> ${destinationHost.id}.`)
+
+          const vms = filter(sourceVms, vm => hostsAverages[destinationHost.id].memoryFree >= vmsAverages[vm.id].memory)
+
+          debug(
+            `Tagged VM ("${tag}") candidates to migrate from host ${sourceHost.id}: ${inspect(mapToArray(vms, 'id'))}.`
+          )
+          vm = this._getAntiAffinityVmToMigrate({
+            vms,
+            vmsAverages,
+            hostsAverages,
+            taggedHosts,
+            sourceHost,
+            destinationHost,
+          })
+          if (vm) {
+            break
+          }
+        }
+
+        if (!vm) {
+          continue // If we can't find a VM to migrate, we must try with another source!
+        }
+        debug(`Migrate VM (${vm.id}) to Host (${destinationHost.id}) from Host (${sourceHost.id}).`)
+
+        // 3. Update tags and averages (every watched tag of the VM, not only the targeted one).
+        // This update can change the source host for the next migration.
+        for (const tag of vm.tags) {
+          if (this._antiAffinityTags.includes(tag)) {
+            sourceHost.tags[tag]--
+            destinationHost.tags[tag]++
+          }
+        }
+
+        const destinationAverages = hostsAverages[destinationHost.id]
+        const vmAverages = vmsAverages[vm.id]
+
+        destinationAverages.cpu += vmAverages.cpu
+        destinationAverages.memoryFree -= vmAverages.memory
+        // NOTE: only the destination averages are adjusted, the source host ones are left as measured.
+        delete sourceHost.vms[vm.id]
+
+        // 4. Migrate. The promise is collected, not awaited: planning goes on immediately.
+        const destination = idToHost[destinationHost.id]
+        promises.push(
+          this.xo
+            .getXapi(idToHost[sourceHost.id])
+            .migrateVm(vm._xapiId, this.xo.getXapi(destination), destination._xapiId)
+        )
+
+        break // Continue with the same tag, the source can be different.
+      }
+    }
+  }
+
+  _getAntiAffinityTaggedHosts(hosts, vms) {
+    // Counts the VMs carrying each watched tag: globally (`tagCount`) and per host (`hosts`).
+    const watchedTags = this._antiAffinityTags
+    const newCounters = () => {
+      const counters = {}
+      for (const watchedTag of watchedTags) {
+        counters[watchedTag] = 0
+      }
+      return counters
+    }
+
+    const tagCount = newCounters()
+    const entries = {}
+    for (const { id, $poolId } of hosts) {
+      const entry = { id, poolId: $poolId, tags: newCounters(), vms: {} }
+
+      // Hide properties when util.inspect is used.
+      Object.defineProperty(entry, 'poolId', { enumerable: false })
+      Object.defineProperty(entry, 'vms', { enumerable: false })
+
+      entries[id] = entry
+    }
+
+    for (const vm of vms) {
+      const { $container: hostId } = vm
+      if (!(hostId in entries)) {
+        // The VM runs on a host which is not part of `hosts`.
+        continue
+      }
+
+      const entry = entries[hostId]
+      for (const vmTag of vm.tags) {
+        if (!watchedTags.includes(vmTag)) {
+          continue
+        }
+
+        tagCount[vmTag]++
+        entry.tags[vmTag]++
+        // Only VMs carrying at least one watched tag are attached to their host.
+        entry.vms[vm.id] = vm
+      }
+    }
+
+    // `hosts` lists the per-host entries built above.
+    return { tagCount, hosts: Object.values(entries) }
+  }
+
+  _computeAntiAffinityVariance(taggedHosts) {
+    // Sum, over every watched tag, of the variance of the per-host VM counts (shifted-data formula).
+    // See: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+    const { hosts, tagCount } = taggedHosts
+    const hostCount = hosts.length
+
+    let total = 0
+    for (const tag in tagCount) {
+      // The counts are shifted by the count of the first host.
+      const shift = hosts[0].tags[tag]
+
+      let sum = 0
+      let sumOfSquares = 0
+      hosts.forEach(host => {
+        const delta = host.tags[tag] - shift
+        sum += delta
+        sumOfSquares += delta * delta
+      })
+
+      total += (sumOfSquares - (sum * sum) / hostCount) / hostCount
+    }
+
+    return total
+  }
+
+  _getAntiAffinityVmToMigrate({ vms, vmsAverages, hostsAverages, taggedHosts, sourceHost, destinationHost }) {
+    // Picks, among `vms`, the migratable VM whose move lowers the tag variance the most (or undefined).
+    const shiftTags = (watchedTags, step) =>
+      watchedTags.forEach(watchedTag => {
+        sourceHost.tags[watchedTag] -= step
+        destinationHost.tags[watchedTag] += step
+      })
+
+    let lowestVariance = this._computeAntiAffinityVariance(taggedHosts)
+    let selectedVm
+    for (const candidate of vms) {
+      const watchedTags = filter(candidate.tags, candidateTag => this._antiAffinityTags.includes(candidateTag))
+      // Simulate the migration on the counters, measure the variance, then roll back.
+      shiftTags(watchedTags, 1)
+      const variance = this._computeAntiAffinityVariance(taggedHosts)
+      shiftTags(watchedTags, -1)
+
+      // Written as a negated `<` so that a NaN variance is rejected too.
+      if (!(variance < lowestVariance)) {
+        continue
+      }
+      if (!candidate.xenTools) {
+        debug(`VM (${candidate.id}) of Host (${sourceHost.id}) does not support pool migration.`)
+        continue
+      }
+      lowestVariance = variance
+      selectedVm = candidate
+    }
+
+    return selectedVm
+  }
 }
--- a/packages/xo-server-load-balancer/src/simple-plan.js
+++ b/packages/xo-server-load-balancer/src/simple-plan.js
@@ -0,0 +1,9 @@
+import Plan from './plan'
+
+// ===================================================================
+
+export default class SimplePlan extends Plan {
+  async execute() {
+    await this._processAntiAffinity() // the only step of this plan: the anti-affinity pass inherited from Plan
+  }
+}