feat(xo-server): implement rolling pool reboot (#7242)

Florent BEAUCHAMP 2024-01-25 17:50:34 +01:00 committed by GitHub
parent 5769da3ebc
commit d6abdb246b
6 changed files with 210 additions and 156 deletions

View File

@@ -200,6 +200,31 @@ rollingUpdate.resolve = {
// -------------------------------------------------------------------
export async function rollingReboot({ bypassBackupCheck, pool }) {
const poolId = pool.id
if (bypassBackupCheck) {
log.warn('pool.rollingReboot update with argument "bypassBackupCheck" set to true', { poolId })
} else {
await backupGuard.call(this, poolId)
}
await this.rollingPoolReboot(pool)
}
rollingReboot.params = {
bypassBackupCheck: {
default: false,
type: 'boolean',
},
pool: { type: 'string' },
}
rollingReboot.resolve = {
pool: ['pool', 'pool', 'administrate'],
}
// -------------------------------------------------------------------
export async function getPatchesDifference({ source, target }) {
return this.getPatchesDifference(target.id, source.id)
}
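
Aside (not part of the diff): the new method is exposed over the regular xo-server JSON-RPC API as pool.rollingReboot. A minimal client sketch, assuming the usual xo-lib client; the URL, credentials and pool UUID below are placeholders:

import Xo from 'xo-lib'

// Illustrative client script only; URL, credentials and pool UUID are placeholders.
const xo = new Xo({ url: 'https://xo.example.org' })
await xo.open()
await xo.signIn({ email: 'admin@admin.net', password: 'admin' })

// bypassBackupCheck defaults to false; setting it to true skips the server-side
// backupGuard check and only logs a warning (see the handler above).
await xo.call('pool.rollingReboot', { pool: '<pool-uuid>' })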

View File

@@ -1,7 +1,5 @@
 import filter from 'lodash/filter.js'
 import find from 'lodash/find.js'
-import groupBy from 'lodash/groupBy.js'
-import mapValues from 'lodash/mapValues.js'
 import pickBy from 'lodash/pickBy.js'
 import some from 'lodash/some.js'
 import unzip from 'unzipper'
@@ -9,15 +7,13 @@ import { asyncEach } from '@vates/async-each'
 import { createLogger } from '@xen-orchestra/log'
 import { decorateObject } from '@vates/decorate-with'
 import { defer as deferrable } from 'golike-defer'
-import { incorrectState } from 'xo-common/api-errors.js'
-import { extractOpaqueRef, parseDateTime } from '@xen-orchestra/xapi'
-import { timeout } from 'promise-toolbox'
+import { extractOpaqueRef } from '@xen-orchestra/xapi'
 import ensureArray from '../../_ensureArray.mjs'
 import { debounceWithKey } from '../../_pDebounceWithKey.mjs'
 import { forEach, mapFilter, parseXml } from '../../utils.mjs'
-import { isHostRunning, useUpdateSystem } from '../utils.mjs'
+import { useUpdateSystem } from '../utils.mjs'
// TOC -------------------------------------------------------------------------
@@ -494,163 +490,33 @@ const methods = {
   async rollingPoolUpdate($defer, { xsCredentials } = {}) {
     const isXcp = _isXcp(this.pool.$master)
-    if (this.pool.ha_enabled) {
-      const haSrs = this.pool.$ha_statefiles.map(vdi => vdi.SR)
-      const haConfig = this.pool.ha_configuration
-      await this.call('pool.disable_ha')
-      $defer(() => this.call('pool.enable_ha', haSrs, haConfig))
-    }
-    const hosts = filter(this.objects.all, { $type: 'host' })
-    {
-      const deadHost = hosts.find(_ => !isHostRunning(_))
-      if (deadHost !== undefined) {
-        // reflect the interface of an XO host object
-        throw incorrectState({
-          actual: 'Halted',
-          expected: 'Running',
-          object: deadHost.$id,
-          property: 'power_state',
-        })
-      }
-    }
-    await Promise.all(hosts.map(host => host.$call('assert_can_evacuate')))
     const hasMissingPatchesByHost = {}
+    const hosts = filter(this.objects.all, { $type: 'host' })
     await asyncEach(hosts, async host => {
       const hostUuid = host.uuid
       const missingPatches = await this.listMissingPatches(hostUuid)
       hasMissingPatchesByHost[hostUuid] = missingPatches.length > 0
     })
-    // On XS/CH, start by installing patches on all hosts
-    if (!isXcp) {
-      log.debug('Install patches')
-      await this.installPatches({ xsCredentials })
-    }
-    // Remember on which hosts the running VMs are
-    const vmRefsByHost = mapValues(
-      groupBy(
-        filter(this.objects.all, {
-          $type: 'VM',
-          power_state: 'Running',
-          is_control_domain: false,
-        }),
-        vm => {
-          const hostId = vm.$resident_on?.$id
-          if (hostId === undefined) {
-            throw new Error('Could not find host of all running VMs')
-          }
-          return hostId
+    await this.rollingPoolReboot({
+      xsCredentials,
+      beforeEvacuateVms: async () => {
+        // On XS/CH, start by installing patches on all hosts
+        if (!isXcp) {
+          log.debug('Install patches')
+          await this.installPatches({ xsCredentials })
+        }
-        }
-      ),
-      vms => vms.map(vm => vm.$ref)
-    )
-    // Put master in first position to restart it first
-    const indexOfMaster = hosts.findIndex(host => host.$ref === this.pool.master)
-    if (indexOfMaster === -1) {
-      throw new Error('Could not find pool master')
-    }
-    ;[hosts[0], hosts[indexOfMaster]] = [hosts[indexOfMaster], hosts[0]]
-    // Restart all the hosts one by one
-    for (const host of hosts) {
-      const hostId = host.uuid
-      if (!hasMissingPatchesByHost[hostId]) {
-        continue
-      }
-      // This is an old metrics reference from before the pool master restart.
-      // The references don't seem to change but it's not guaranteed.
-      const metricsRef = host.metrics
-      await this.barrier(metricsRef)
-      await this._waitObjectState(metricsRef, metrics => metrics.live)
-      const getServerTime = async () => parseDateTime(await this.call('host.get_servertime', host.$ref)) * 1e3
-      let rebootTime
-      if (isXcp) {
-        // On XCP-ng, install patches on each host one by one instead of all at once
-        log.debug(`Evacuate host ${hostId}`)
-        await this.clearHost(host)
-        log.debug(`Install patches on host ${hostId}`)
-        await this.installPatches({ hosts: [host] })
-        log.debug(`Restart host ${hostId}`)
-        rebootTime = await getServerTime()
-        await this.callAsync('host.reboot', host.$ref)
-      } else {
-        // On XS/CH, we only need to evacuate/restart the hosts one by one since patches have already been installed
-        log.debug(`Evacuate and restart host ${hostId}`)
-        rebootTime = await getServerTime()
-        await this.rebootHost(hostId)
-      }
-      log.debug(`Wait for host ${hostId} to be up`)
-      await timeout.call(
-        (async () => {
-          await this._waitObjectState(
-            hostId,
-            host => host.enabled && rebootTime < host.other_config.agent_start_time * 1e3
-          )
-          await this._waitObjectState(metricsRef, metrics => metrics.live)
-        })(),
-        this._restartHostTimeout,
-        new Error(`Host ${hostId} took too long to restart`)
-      )
-      log.debug(`Host ${hostId} is up`)
-    }
-    if (some(hasMissingPatchesByHost)) {
-      log.debug('Migrate VMs back to where they were')
-    }
-    // Start with the last host since it's the emptiest one after the rolling
-    // update
-    ;[hosts[0], hosts[hosts.length - 1]] = [hosts[hosts.length - 1], hosts[0]]
-    let error
-    for (const host of hosts) {
-      const hostId = host.uuid
-      if (!hasMissingPatchesByHost[hostId]) {
-        continue
-      }
-      const vmRefs = vmRefsByHost[hostId]
-      if (vmRefs === undefined) {
-        continue
-      }
-      // host.$resident_VMs is outdated and returns resident VMs before the host.evacuate.
-      // this.getField is used in order not to get cached data.
-      const residentVmRefs = await this.getField('host', host.$ref, 'resident_VMs')
-      for (const vmRef of vmRefs) {
-        if (residentVmRefs.includes(vmRef)) {
-          continue
-        }
+      },
+      beforeRebootHost: async host => {
+        if (isXcp) {
+          log.debug(`Install patches on host ${host.id}`)
+          await this.installPatches({ hosts: [host] })
+        }
-        try {
-          const vmId = await this.getField('VM', vmRef, 'uuid')
-          await this.migrateVm(vmId, this, hostId)
-        } catch (err) {
-          log.error(err)
-          if (error === undefined) {
-            error = err
-          }
-        }
-      }
-    }
-    if (error !== undefined) {
-      throw error
-    }
+      },
+      ignoreHost: host => {
+        return !hasMissingPatchesByHost[host.uuid]
+      },
+    })
   },
}
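
Net effect of this hunk: rollingPoolUpdate no longer drives the reboot loop itself; it delegates to the generic rollingPoolReboot added in the next file and injects the patching-specific steps through three optional hooks. A JSDoc sketch of that hook contract, added here for illustration only (it is not part of the commit):

/**
 * @typedef {Object} RollingPoolRebootHooks
 * @property {() => Promise<void>} [beforeEvacuateVms]
 *   Runs once, after the HA/evacuation pre-checks and before any host is evacuated.
 *   The update path installs all XS/CH patches here.
 * @property {(host: object) => Promise<void>} [beforeRebootHost]
 *   Runs for each host, after its VMs have been evacuated and before host.reboot.
 *   The update path installs XCP-ng patches here, host by host.
 * @property {(host: object) => boolean} [ignoreHost]
 *   Return true to skip a host entirely; the update path skips hosts with no
 *   missing patches.
 */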

View File

@@ -1,7 +1,16 @@
+import { cancelable, timeout } from 'promise-toolbox'
+import { createLogger } from '@xen-orchestra/log'
 import { decorateObject } from '@vates/decorate-with'
-import { cancelable } from 'promise-toolbox'
+import { defer as deferrable } from 'golike-defer'
+import { incorrectState } from 'xo-common/api-errors.js'
+import { isHostRunning } from '../utils.mjs'
+import { parseDateTime } from '@xen-orchestra/xapi'
+import filter from 'lodash/filter.js'
+import groupBy from 'lodash/groupBy.js'
+import mapValues from 'lodash/mapValues.js'

 const PATH_DB_DUMP = '/pool/xmldbdump'

+const log = createLogger('xo:xapi')
const methods = {
exportPoolMetadata($cancelToken) {
@@ -21,9 +30,156 @@ const methods = {
task: this.task_create('Import pool metadata'),
})
},
async rollingPoolReboot($defer, { beforeEvacuateVms, beforeRebootHost, ignoreHost } = {}) {
if (this.pool.ha_enabled) {
const haSrs = this.pool.$ha_statefiles.map(vdi => vdi.SR)
const haConfig = this.pool.ha_configuration
await this.call('pool.disable_ha')
$defer(() => this.call('pool.enable_ha', haSrs, haConfig))
}
const hosts = filter(this.objects.all, { $type: 'host' })
{
const deadHost = hosts.find(_ => !isHostRunning(_))
if (deadHost !== undefined) {
// reflect the interface of an XO host object
throw incorrectState({
actual: 'Halted',
expected: 'Running',
object: deadHost.$id,
property: 'power_state',
})
}
}
await Promise.all(hosts.map(host => host.$call('assert_can_evacuate')))
if (beforeEvacuateVms) {
await beforeEvacuateVms()
}
// Remember on which hosts the running VMs are
const vmRefsByHost = mapValues(
groupBy(
filter(this.objects.all, {
$type: 'VM',
power_state: 'Running',
is_control_domain: false,
}),
vm => {
const hostId = vm.$resident_on?.$id
if (hostId === undefined) {
throw new Error('Could not find host of all running VMs')
}
return hostId
}
),
vms => vms.map(vm => vm.$ref)
)
// Put master in first position to restart it first
const indexOfMaster = hosts.findIndex(host => host.$ref === this.pool.master)
if (indexOfMaster === -1) {
throw new Error('Could not find pool master')
}
;[hosts[0], hosts[indexOfMaster]] = [hosts[indexOfMaster], hosts[0]]
let hasRestartedOne = false
// Restart all the hosts one by one
for (const host of hosts) {
const hostId = host.uuid
if (ignoreHost && ignoreHost(host)) {
continue
}
// This is an old metrics reference from before the pool master restart.
// The references don't seem to change but it's not guaranteed.
const metricsRef = host.metrics
await this.barrier(metricsRef)
await this._waitObjectState(metricsRef, metrics => metrics.live)
const getServerTime = async () => parseDateTime(await this.call('host.get_servertime', host.$ref)) * 1e3
log.debug(`Evacuate host ${hostId}`)
await this.clearHost(host)
if (beforeRebootHost) {
await beforeRebootHost(host)
}
log.debug(`Restart host ${hostId}`)
const rebootTime = await getServerTime()
await this.callAsync('host.reboot', host.$ref)
log.debug(`Wait for host ${hostId} to be up`)
await timeout.call(
(async () => {
await this._waitObjectState(
hostId,
host => host.enabled && rebootTime < host.other_config.agent_start_time * 1e3
)
await this._waitObjectState(metricsRef, metrics => metrics.live)
})(),
this._restartHostTimeout,
new Error(`Host ${hostId} took too long to restart`)
)
log.debug(`Host ${hostId} is up`)
hasRestartedOne = true
}
if (hasRestartedOne) {
log.debug('Migrate VMs back to where they were')
}
// Start with the last host since it's the emptiest one after the rolling
// update
;[hosts[0], hosts[hosts.length - 1]] = [hosts[hosts.length - 1], hosts[0]]
let error
for (const host of hosts) {
const hostId = host.uuid
if (ignoreHost && ignoreHost(host)) {
continue
}
const vmRefs = vmRefsByHost[hostId]
if (vmRefs === undefined) {
continue
}
// host.$resident_VMs is outdated and returns resident VMs before the host.evacuate.
// this.getField is used in order not to get cached data.
const residentVmRefs = await this.getField('host', host.$ref, 'resident_VMs')
for (const vmRef of vmRefs) {
if (residentVmRefs.includes(vmRef)) {
continue
}
try {
const vmId = await this.getField('VM', vmRef, 'uuid')
await this.migrateVm(vmId, this, hostId)
} catch (err) {
log.error(err)
if (error === undefined) {
error = err
}
}
}
}
if (error !== undefined) {
throw error
}
},
}
export default decorateObject(methods, {
exportPoolMetadata: cancelable,
importPoolMetadata: cancelable,
rollingPoolReboot: deferrable,
})
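
Note that rollingPoolReboot is wrapped with deferrable: the $defer callback registered when HA is disabled runs once the method settles, so HA is re-enabled even if a host reboot in the middle of the loop fails. A toy illustration of that pattern, under the assumption that plain $defer callbacks run on both success and failure; disableHa, enableHa and rebootAllHosts are hypothetical stand-ins, not project code:

import { defer as deferrable } from 'golike-defer'

const rebootWithHaGuard = deferrable(async function ($defer, pool) {
  if (pool.haEnabled) {
    const saved = await pool.disableHa() // whatever is needed to restore HA later
    $defer(() => pool.enableHa(saved)) // runs once the function settles, error or not
  }
  await pool.rebootAllHosts()
})

// usage: await rebootWithHaGuard(pool) — the wrapper injects $defer as first argument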

View File

@@ -33,6 +33,7 @@ const AUTHORIZATIONS = {
LIST_MISSING_PATCHES: STARTER,
POOL_EMERGENCY_SHUTDOWN: ENTERPRISE,
ROLLING_POOL_UPDATE: ENTERPRISE,
ROLLING_POOL_REBOOT: ENTERPRISE,
}
export default class Authorization {

View File

@@ -134,4 +134,10 @@ export default class Pools {
srsByPool[pool.id].some(sr => sr.size - sr.physical_usage >= minAvailableSrSize && checkSrName(sr.name_label))
)
}
async rollingPoolReboot(pool) {
const { _app } = this
await _app.checkFeatureAuthorization('ROLLING_POOL_REBOOT')
await _app.getXapi(pool).rollingPoolReboot()
}
}
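
Unlike the update path, this reboot-only entry point passes no hooks to the xapi method, so no host is skipped: every live host is evacuated, rebooted (master first) and then refilled with its VMs. A hypothetical direct use, assuming app and pool are already in scope:

const xapi = app.getXapi(pool)
await xapi.rollingPoolReboot() // no options: no per-host filtering, no patch hooks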

View File

@@ -641,7 +641,7 @@ export default class XenServers {
async rollingPoolUpdate($defer, pool) {
const app = this._app
await app.checkFeatureAuthorization('ROLLING_POOL_UPDATE')
const [schedules, jobs] = await Promise.all([app.getAllSchedules(), app.getAllJobs('backup')])
const poolId = pool.id