From 9ef05b8afede7eba85ed5cae32d1017d6d24f905 Mon Sep 17 00:00:00 2001
From: Ronan Abhamon
Date: Tue, 30 Mar 2021 17:25:41 +0200
Subject: [PATCH] feat(load-balancer): add new anti-affinity mode (#5652)

Fixes #5600
---
 CHANGELOG.unreleased.md                       |   3 +
 .../src/density-plan.js                       |  26 +-
 packages/xo-server-load-balancer/src/index.js |  28 +-
 .../src/performance-plan.js                   |  32 +-
 packages/xo-server-load-balancer/src/plan.js  | 315 ++++++++++++++++--
 .../src/simple-plan.js                        |  10 +
 6 files changed, 374 insertions(+), 40 deletions(-)
 create mode 100644 packages/xo-server-load-balancer/src/simple-plan.js

diff --git a/CHANGELOG.unreleased.md b/CHANGELOG.unreleased.md
index a4b3a2c17..64f3f9680 100644
--- a/CHANGELOG.unreleased.md
+++ b/CHANGELOG.unreleased.md
@@ -5,6 +5,8 @@
 
 ### Enhancements
 
+- [Host/Load-balancer] Add a new anti-affinity mode (PR [#5652](https://github.com/vatesfr/xen-orchestra/pull/5652))
+
 ### Bug fixes
 
 ### Packages to release
@@ -26,5 +28,6 @@
 
 - @xen-orchestra/xapi minor
 - @xen-orchestra/backups minor
+- xo-server-load-balancer minor
 - xo-server patch
 - xo-web minor
diff --git a/packages/xo-server-load-balancer/src/density-plan.js b/packages/xo-server-load-balancer/src/density-plan.js
index 3c7b53357..2579c2bec 100644
--- a/packages/xo-server-load-balancer/src/density-plan.js
+++ b/packages/xo-server-load-balancer/src/density-plan.js
@@ -15,13 +15,19 @@ export default class DensityPlan extends Plan {
   }
 
   async execute() {
-    const results = await this._findHostsToOptimize()
+    await this._processAntiAffinity()
+
+    const hosts = this._getHosts()
+    const results = await this._getHostStatsAverages({
+      hosts,
+      toOptimizeOnly: true,
+    })
 
     if (!results) {
       return
     }
 
-    const { hosts, toOptimize } = results
+    const { toOptimize } = results
 
     let { averages: hostsAverages } = results
 
@@ -94,14 +100,26 @@ export default class DensityPlan extends Plan {
 
     debug(`Try to optimize Host (${hostId}).`)
 
-    const vms = await this._getVms(hostId)
-    const vmsAverages = await this._getVmsAverages(vms, host)
+    const vms = filter(this._getAllRunningVms(), vm => vm.$container === hostId)
+    const vmsAverages = await this._getVmsAverages(vms, { [host.id]: host })
 
     for (const vm of vms) {
       if (!vm.xenTools) {
         debug(`VM (${vm.id}) of Host (${hostId}) does not support pool migration.`)
         return
       }
+
+      for (const tag of vm.tags) {
+        // TODO: Improve this piece of code. We could compute variance to check if the VM
+        // is migratable. But the code must be rewritten:
+        // - All VMs, hosts and stats must be fetched at one place.
+        // - It's necessary to maintain a dictionary of tags for each host.
+        // - ...
+        if (this._antiAffinityTags.includes(tag)) {
+          debug(`VM (${vm.id}) of Host (${hostId}) cannot be migrated. It contains anti-affinity tag '${tag}'.`)
+          return
+        }
+      }
     }
 
     // Sort vms by amount of memory. (+ -> -)
diff --git a/packages/xo-server-load-balancer/src/index.js b/packages/xo-server-load-balancer/src/index.js
index f9c47d77b..fba716a89 100644
--- a/packages/xo-server-load-balancer/src/index.js
+++ b/packages/xo-server-load-balancer/src/index.js
@@ -3,6 +3,7 @@ import { intersection, uniq } from 'lodash'
 
 import DensityPlan from './density-plan'
 import PerformancePlan from './performance-plan'
+import SimplePlan from './simple-plan'
 import { DEFAULT_CRITICAL_THRESHOLD_CPU, DEFAULT_CRITICAL_THRESHOLD_MEMORY_FREE } from './plan'
 import { EXECUTION_DELAY, debug } from './utils'
 
@@ -33,7 +34,7 @@ export const configurationSchema = {
           },
 
           mode: {
-            enum: ['Performance mode', 'Density mode'],
+            enum: ['Performance mode', 'Density mode', 'Simple mode'],
             title: 'Mode',
           },
 
@@ -75,6 +76,17 @@ export const configurationSchema = {
               $type: 'Host',
             },
           },
+
+          antiAffinityTags: {
+            type: 'array',
+            title: 'Anti-affinity tags',
+            description: 'list of VM tags to force place VMs on different hosts',
+
+            items: {
+              type: 'string',
+              $type: 'Tag',
+            },
+          },
         },
 
         required: ['name', 'mode', 'pools'],
@@ -130,11 +142,15 @@ class LoadBalancerPlugin {
     }
 
     this._poolIds = this._poolIds.concat(pools)
-    this._plans.push(
-      mode === PERFORMANCE_MODE
-        ? new PerformancePlan(this.xo, name, pools, options)
-        : new DensityPlan(this.xo, name, pools, options)
-    )
+    let plan
+    if (mode === PERFORMANCE_MODE) {
+      plan = new PerformancePlan(this.xo, name, pools, options)
+    } else if (mode === DENSITY_MODE) {
+      plan = new DensityPlan(this.xo, name, pools, options)
+    } else {
+      plan = new SimplePlan(this.xo, name, pools, options)
+    }
+    this._plans.push(plan)
   }
 
   _executePlans() {
diff --git a/packages/xo-server-load-balancer/src/performance-plan.js b/packages/xo-server-load-balancer/src/performance-plan.js
index 4467c85f4..169546a15 100644
--- a/packages/xo-server-load-balancer/src/performance-plan.js
+++ b/packages/xo-server-load-balancer/src/performance-plan.js
@@ -42,14 +42,19 @@ export default class PerformancePlan extends Plan {
       console.error(error)
     }
 
-    const results = await this._findHostsToOptimize()
+    await this._processAntiAffinity()
+
+    const hosts = this._getHosts()
+    const results = await this._getHostStatsAverages({
+      hosts,
+      toOptimizeOnly: true,
+    })
 
     if (!results) {
       return
     }
 
     const { averages, toOptimize } = results
-    const { hosts } = results
 
     toOptimize.sort((a, b) => {
       a = averages[a.id]
@@ -75,8 +80,8 @@ export default class PerformancePlan extends Plan {
   }
 
   async _optimize({ exceededHost, hosts, hostsAverages }) {
-    const vms = await this._getVms(exceededHost.id)
-    const vmsAverages = await this._getVmsAverages(vms, exceededHost)
+    const vms = filter(this._getAllRunningVms(), vm => vm.$container === exceededHost.id)
+    const vmsAverages = await this._getVmsAverages(vms, { [exceededHost.id]: exceededHost })
 
     // Sort vms by cpu usage. (lower to higher)
     vms.sort((a, b) => vmsAverages[b.id].cpu - vmsAverages[a.id].cpu)
@@ -121,6 +126,25 @@ export default class PerformancePlan extends Plan {
         continue
       }
 
+      if (!vm.xenTools) {
+        debug(`VM (${vm.id}) of Host (${exceededHost.id}) does not support pool migration.`)
+        continue
+      }
+
+      // TODO: Improve this piece of code. We could compute variance to check if the VM
+      // is migratable. But the code must be rewritten:
+      // - All VMs, hosts and stats must be fetched at one place.
+      // - It's necessary to maintain a dictionary of tags for each host.
+      // - ...
+      // Use `find` rather than a nested loop: the `continue` below must skip the VM, not a tag.
+      const antiAffinityTag = vm.tags.find(tag => this._antiAffinityTags.includes(tag))
+      if (antiAffinityTag !== undefined) {
+        debug(
+          `VM (${vm.id}) of Host (${exceededHost.id}) cannot be migrated. It contains anti-affinity tag '${antiAffinityTag}'.`
+        )
+        continue
+      }
+
       exceededAverages.cpu -= vmAverages.cpu
       destinationAverages.cpu += vmAverages.cpu
 
diff --git a/packages/xo-server-load-balancer/src/plan.js b/packages/xo-server-load-balancer/src/plan.js
index 1a1a6c8a5..6a031612b 100644
--- a/packages/xo-server-load-balancer/src/plan.js
+++ b/packages/xo-server-load-balancer/src/plan.js
@@ -1,4 +1,5 @@
-import { filter, includes, map as mapToArray, size } from 'lodash'
+import { filter, groupBy, includes, isEmpty, keyBy, map as mapToArray, maxBy, minBy, size, sortBy } from 'lodash'
+import { inspect } from 'util'
 
 import { EXECUTION_DELAY, debug } from './utils'
 
@@ -94,7 +95,7 @@ function setRealCpuAverageOfVms(vms, vmsAverages, nCpus) {
 // ===================================================================
 
 export default class Plan {
-  constructor(xo, name, poolIds, { excludedHosts, thresholds } = {}) {
+  constructor(xo, name, poolIds, { excludedHosts, thresholds, antiAffinityTags = [] } = {}) {
     this.xo = xo
     this._name = name
     this._poolIds = poolIds
@@ -107,6 +108,7 @@ export default class Plan {
         critical: numberOrDefault(thresholds && thresholds.memoryFree, DEFAULT_CRITICAL_THRESHOLD_MEMORY_FREE) * 1024,
       },
     }
+    this._antiAffinityTags = antiAffinityTags
 
     for (const key in this._thresholds) {
       const attr = this._thresholds[key]
@@ -130,36 +132,35 @@ export default class Plan {
   // Get hosts to optimize.
   // ===================================================================
 
-  async _findHostsToOptimize() {
-    const hosts = this._getHosts()
+  async _getHostStatsAverages({ hosts, toOptimizeOnly = false }) {
     const hostsStats = await this._getHostsStats(hosts, 'minutes')
 
-    // Check if a resource's utilization exceeds threshold.
     const avgNow = computeResourcesAverage(hosts, hostsStats, EXECUTION_DELAY)
-    let toOptimize = this._checkResourcesThresholds(hosts, avgNow)
-
-    // No resource's utilization problem.
-    if (toOptimize.length === 0) {
-      debug('No hosts to optimize.')
-      return
+    let toOptimize
+    if (toOptimizeOnly) {
+      // Check if a resource utilization exceeds threshold.
+      toOptimize = this._checkResourcesThresholds(hosts, avgNow)
+      if (toOptimize.length === 0) {
+        debug('No hosts to optimize.')
+        return
+      }
     }
 
-    // Check in the last 30 min interval with ratio.
     const avgBefore = computeResourcesAverage(hosts, hostsStats, MINUTES_OF_HISTORICAL_DATA)
     const avgWithRatio = computeResourcesAverageWithWeight(avgNow, avgBefore, 0.75)
 
-    toOptimize = this._checkResourcesThresholds(toOptimize, avgWithRatio)
-
-    // No resource's utilization problem.
-    if (toOptimize.length === 0) {
-      debug('No hosts to optimize.')
-      return
+    if (toOptimizeOnly) {
+      // Check in the last 30 min interval with ratio.
+      toOptimize = this._checkResourcesThresholds(toOptimize, avgWithRatio)
+      if (toOptimize.length === 0) {
+        debug('No hosts to optimize.')
+        return
+      }
     }
 
     return {
       toOptimize,
       averages: avgWithRatio,
-      hosts,
     }
   }
 
@@ -197,11 +198,8 @@ export default class Plan {
     )
   }
 
-  async _getVms(hostId) {
-    return filter(
-      this.xo.getObjects(),
-      object => object.type === 'VM' && object.power_state === 'Running' && object.$container === hostId
-    )
+  _getAllRunningVms() {
+    return filter(this.xo.getObjects(), object => object.type === 'VM' && object.power_state === 'Running')
   }
 
   // ===================================================================
@@ -244,7 +242,7 @@ export default class Plan {
     return vmsStats
   }
 
-  async _getVmsAverages(vms, host) {
+  async _getVmsAverages(vms, hosts) {
     const vmsStats = await this._getVmsStats(vms, 'minutes')
     const vmsAverages = computeResourcesAverageWithWeight(
       computeResourcesAverage(vms, vmsStats, EXECUTION_DELAY),
@@ -253,8 +251,273 @@ export default class Plan {
     )
 
     // Compute real CPU usage. Virtuals cpus to reals cpus.
-    setRealCpuAverageOfVms(vms, vmsAverages, host.CPUs.cpu_count)
+    for (const [hostId, hostVms] of Object.entries(groupBy(vms, '$container'))) {
+      setRealCpuAverageOfVms(hostVms, vmsAverages, hosts[hostId].CPUs.cpu_count)
+    }
 
     return vmsAverages
   }
+
+  // ===================================================================
+  // Anti-affinity helpers
+  // ===================================================================
+
+  // Spread the running VMs sharing an anti-affinity tag across the hosts returned by `_getHosts()`.
+  // No-op when no tag is configured, when there is at most one host or when every tag is already balanced.
+  async _processAntiAffinity() {
+    if (!this._antiAffinityTags.length) {
+      return
+    }
+
+    const allHosts = await this._getHosts()
+    if (allHosts.length <= 1) {
+      return
+    }
+    const idToHost = keyBy(allHosts, 'id')
+
+    const allVms = filter(this._getAllRunningVms(), vm => vm.$container in idToHost)
+    const taggedHosts = this._getAntiAffinityTaggedHosts(allHosts, allVms)
+
+    // 1. Check if we must migrate VMs...
+    const tagsDiff = {}
+    for (const watchedTag of this._antiAffinityTags) {
+      const getCount = fn => fn(taggedHosts.hosts, host => host.tags[watchedTag]).tags[watchedTag]
+      const diff = getCount(maxBy) - getCount(minBy)
+      if (diff > 1) {
+        tagsDiff[watchedTag] = diff - 1
+      }
+    }
+
+    if (isEmpty(tagsDiff)) {
+      return
+    }
+
+    // 2. Migrate!
+    debug('Try to apply anti-affinity policy.')
+    debug(`VM tag count per host: ${inspect(taggedHosts, { depth: null })}.`)
+    debug(`Tags diff: ${inspect(tagsDiff, { depth: null })}.`)
+
+    const vmsAverages = await this._getVmsAverages(allVms, idToHost)
+    const { averages: hostsAverages } = await this._getHostStatsAverages({ hosts: allHosts })
+
+    debug(`Hosts averages: ${inspect(hostsAverages, { depth: null })}.`)
+
+    const promises = []
+    for (const tag in tagsDiff) {
+      promises.push(...this._processAntiAffinityTag({ tag, vmsAverages, hostsAverages, taggedHosts, idToHost }))
+    }
+
+    // 3. Done!
+    debug(`VM tag count per host after migration: ${inspect(taggedHosts, { depth: null })}.`)
+    return Promise.all(promises)
+  }
+
+  // Start migrations for one tag until no source/destination pair can be found anymore.
+  // Returns the migration promises; `taggedHosts` and `hostsAverages` are updated in place.
+  _processAntiAffinityTag({ tag, vmsAverages, hostsAverages, taggedHosts, idToHost }) {
+    const promises = []
+
+    while (true) {
+      // 1. Find source host from which to migrate.
+      const sources = sortBy(
+        filter(taggedHosts.hosts, host => host.tags[tag] > 1),
+        [
+          host => host.tags[tag],
+          // Find host with the most memory used. Don't forget the "-". ;)
+          host => -hostsAverages[host.id].memoryFree,
+        ]
+      )
+
+      for (let sourceIndex = sources.length; sourceIndex >= 0; --sourceIndex) {
+        if (sourceIndex === 0) {
+          return promises // Nothing to migrate or we can't.
+        }
+
+        const sourceHost = sources[sourceIndex - 1]
+
+        // 2. Find destination host.
+        const destinations = sortBy(
+          filter(taggedHosts.hosts, host => host.id !== sourceHost.id && host.tags[tag] + 1 < sourceHost.tags[tag]),
+          [
+            host => host.tags[tag],
+            // Ideally it would be interesting to migrate in the same pool.
+            host => host.poolId !== sourceHost.poolId,
+            // Find host with the least memory used. Don't forget the "-". ;)
+            host => -hostsAverages[host.id].memoryFree,
+          ]
+        )
+        if (!destinations.length) {
+          return promises // Cannot find a valid destination.
+        }
+
+        // Build VM list to migrate.
+        // We try to migrate VMs with the targeted tag.
+        const sourceVms = filter(sourceHost.vms, vm => vm.tags.includes(tag))
+
+        let destinationHost
+        let vm
+        for (const destination of destinations) {
+          destinationHost = destination
+          debug(`Host candidate: ${sourceHost.id} -> ${destinationHost.id}.`)
+
+          const vms = filter(sourceVms, vm => hostsAverages[destinationHost.id].memoryFree >= vmsAverages[vm.id].memory)
+
+          debug(
+            `Tagged VM ("${tag}") candidates to migrate from host ${sourceHost.id}: ${inspect(mapToArray(vms, 'id'))}.`
+          )
+          vm = this._getAntiAffinityVmToMigrate({
+            vms,
+            vmsAverages,
+            hostsAverages,
+            taggedHosts,
+            sourceHost,
+            destinationHost,
+          })
+          if (vm) {
+            break
+          }
+        }
+
+        if (!vm) {
+          continue // If we can't find a VM to migrate, we must try with another source!
+        }
+        debug(`Migrate VM (${vm.id}) to Host (${destinationHost.id}) from Host (${sourceHost.id}).`)
+
+        // 3. Update tags and averages.
+        // This update can change the source host for the next migration.
+        for (const tag of vm.tags) {
+          if (this._antiAffinityTags.includes(tag)) {
+            sourceHost.tags[tag]--
+            destinationHost.tags[tag]++
+          }
+        }
+
+        const destinationAverages = hostsAverages[destinationHost.id]
+        const vmAverages = vmsAverages[vm.id]
+
+        destinationAverages.cpu += vmAverages.cpu
+        destinationAverages.memoryFree -= vmAverages.memory
+
+        delete sourceHost.vms[vm.id]
+
+        // 4. Migrate.
+        const destination = idToHost[destinationHost.id]
+        promises.push(
+          this.xo
+            .getXapi(idToHost[sourceHost.id])
+            .migrateVm(vm._xapiId, this.xo.getXapi(destination), destination._xapiId)
+        )
+
+        break // Continue with the same tag, the source can be different.
+      }
+    }
+  }
+
+  // Count, per host and globally, the given VMs carrying each watched anti-affinity tag.
+  _getAntiAffinityTaggedHosts(hosts, vms) {
+    const tagCount = {}
+    for (const tag of this._antiAffinityTags) {
+      tagCount[tag] = 0
+    }
+
+    const taggedHosts = {}
+    for (const host of hosts) {
+      const tags = {}
+      for (const tag of this._antiAffinityTags) {
+        tags[tag] = 0
+      }
+
+      const taggedHost = (taggedHosts[host.id] = {
+        id: host.id,
+        poolId: host.$poolId,
+        tags,
+        vms: {},
+      })
+
+      // Hide properties when util.inspect is used.
+      Object.defineProperties(taggedHost, {
+        poolId: { enumerable: false },
+        vms: { enumerable: false },
+      })
+    }
+
+    for (const vm of vms) {
+      const hostId = vm.$container
+      if (!(hostId in taggedHosts)) {
+        continue
+      }
+
+      const taggedHost = taggedHosts[hostId]
+
+      for (const tag of vm.tags) {
+        if (this._antiAffinityTags.includes(tag)) {
+          tagCount[tag]++
+          taggedHost.tags[tag]++
+          taggedHost.vms[vm.id] = vm
+        }
+      }
+    }
+
+    return { tagCount, hosts: Object.values(taggedHosts) }
+  }
+
+  // Sum, over the watched tags, of the variance of the per-host tagged VM counts.
+  _computeAntiAffinityVariance(taggedHosts) {
+    // See: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+    let variance = 0
+
+    const { hosts } = taggedHosts
+    for (const tag in taggedHosts.tagCount) {
+      const k = hosts[0].tags[tag]
+
+      let ex = 0
+      let ex2 = 0
+
+      for (const host of hosts) {
+        const x = host.tags[tag]
+        const diff = x - k
+        ex += diff
+        ex2 += diff * diff
+      }
+
+      const n = hosts.length
+      variance += (ex2 - (ex * ex) / n) / n
+    }
+
+    return variance
+  }
+
+  // Pick the VM (with xenTools) whose move to `destinationHost` lowers the variance the most.
+  // Returns undefined when no candidate improves on the current distribution.
+  _getAntiAffinityVmToMigrate({ vms, vmsAverages, hostsAverages, taggedHosts, sourceHost, destinationHost }) {
+    let bestVariance = this._computeAntiAffinityVariance(taggedHosts)
+    let bestVm
+
+    for (const vm of vms) {
+      const vmTags = filter(vm.tags, tag => this._antiAffinityTags.includes(tag))
+
+      for (const tag of vmTags) {
+        sourceHost.tags[tag]--
+        destinationHost.tags[tag]++
+      }
+
+      const variance = this._computeAntiAffinityVariance(taggedHosts)
+
+      for (const tag of vmTags) {
+        sourceHost.tags[tag]++
+        destinationHost.tags[tag]--
+      }
+
+      if (variance < bestVariance) {
+        if (vm.xenTools) {
+          bestVariance = variance
+          bestVm = vm
+        } else {
+          debug(`VM (${vm.id}) of Host (${sourceHost.id}) does not support pool migration.`)
+        }
+      }
+    }
+
+    return bestVm
+  }
 }
diff --git a/packages/xo-server-load-balancer/src/simple-plan.js b/packages/xo-server-load-balancer/src/simple-plan.js
new file mode 100644
index 000000000..c49f3abcb
--- /dev/null
+++ b/packages/xo-server-load-balancer/src/simple-plan.js
@@ -0,0 +1,10 @@
+import Plan from './plan'
+
+// ===================================================================
+
+// Plan that only applies the anti-affinity policy: `execute` does nothing else.
+export default class SimplePlan extends Plan {
+  async execute() {
+    await this._processAntiAffinity()
+  }
+}