feat(xo-server): implement rolling pool reboot (#7242)
This commit is contained in:
parent
5769da3ebc
commit
d6abdb246b
@ -200,6 +200,31 @@ rollingUpdate.resolve = {
|
||||
|
||||
// -------------------------------------------------------------------
|
||||
|
||||
// Reboot every host of the pool one at a time (rolling reboot).
//
// Unless `bypassBackupCheck` is true, `backupGuard` is consulted first so the
// reboot does not start while backup jobs are scheduled for this pool.
// The actual work is delegated to the xapi-level `rollingPoolReboot`.
export async function rollingReboot({ bypassBackupCheck, pool }) {
  const poolId = pool.id
  if (bypassBackupCheck) {
    // NOTE(review): message previously read "pool.rollingReboot update with argument …",
    // a copy/paste leftover from the rollingUpdate handler — stray word removed.
    log.warn('pool.rollingReboot with argument "bypassBackupCheck" set to true', { poolId })
  } else {
    await backupGuard.call(this, poolId)
  }

  await this.rollingPoolReboot(pool)
}
|
||||
|
||||
// API parameter schema for pool.rollingReboot.
rollingReboot.params = {
  bypassBackupCheck: {
    type: 'boolean',
    default: false,
  },
  pool: { type: 'string' },
}

// Resolve the `pool` id to a pool object; caller needs `administrate` permission.
rollingReboot.resolve = {
  pool: ['pool', 'pool', 'administrate'],
}
|
||||
|
||||
// -------------------------------------------------------------------
|
||||
|
||||
// Thin API wrapper: delegates to the instance-level getPatchesDifference,
// passing the target id first and the source id second.
export async function getPatchesDifference({ source, target }) {
  const { id: sourceId } = source
  const { id: targetId } = target
  return this.getPatchesDifference(targetId, sourceId)
}
|
||||
|
@ -1,7 +1,5 @@
|
||||
import filter from 'lodash/filter.js'
|
||||
import find from 'lodash/find.js'
|
||||
import groupBy from 'lodash/groupBy.js'
|
||||
import mapValues from 'lodash/mapValues.js'
|
||||
import pickBy from 'lodash/pickBy.js'
|
||||
import some from 'lodash/some.js'
|
||||
import unzip from 'unzipper'
|
||||
@ -9,15 +7,13 @@ import { asyncEach } from '@vates/async-each'
|
||||
import { createLogger } from '@xen-orchestra/log'
|
||||
import { decorateObject } from '@vates/decorate-with'
|
||||
import { defer as deferrable } from 'golike-defer'
|
||||
import { incorrectState } from 'xo-common/api-errors.js'
|
||||
import { extractOpaqueRef, parseDateTime } from '@xen-orchestra/xapi'
|
||||
import { timeout } from 'promise-toolbox'
|
||||
import { extractOpaqueRef } from '@xen-orchestra/xapi'
|
||||
|
||||
import ensureArray from '../../_ensureArray.mjs'
|
||||
import { debounceWithKey } from '../../_pDebounceWithKey.mjs'
|
||||
import { forEach, mapFilter, parseXml } from '../../utils.mjs'
|
||||
|
||||
import { isHostRunning, useUpdateSystem } from '../utils.mjs'
|
||||
import { useUpdateSystem } from '../utils.mjs'
|
||||
|
||||
// TOC -------------------------------------------------------------------------
|
||||
|
||||
@ -494,163 +490,33 @@ const methods = {
|
||||
async rollingPoolUpdate($defer, { xsCredentials } = {}) {
|
||||
const isXcp = _isXcp(this.pool.$master)
|
||||
|
||||
if (this.pool.ha_enabled) {
|
||||
const haSrs = this.pool.$ha_statefiles.map(vdi => vdi.SR)
|
||||
const haConfig = this.pool.ha_configuration
|
||||
await this.call('pool.disable_ha')
|
||||
$defer(() => this.call('pool.enable_ha', haSrs, haConfig))
|
||||
}
|
||||
|
||||
const hosts = filter(this.objects.all, { $type: 'host' })
|
||||
|
||||
{
|
||||
const deadHost = hosts.find(_ => !isHostRunning(_))
|
||||
if (deadHost !== undefined) {
|
||||
// reflect the interface of an XO host object
|
||||
throw incorrectState({
|
||||
actual: 'Halted',
|
||||
expected: 'Running',
|
||||
object: deadHost.$id,
|
||||
property: 'power_state',
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
await Promise.all(hosts.map(host => host.$call('assert_can_evacuate')))
|
||||
|
||||
const hasMissingPatchesByHost = {}
|
||||
const hosts = filter(this.objects.all, { $type: 'host' })
|
||||
await asyncEach(hosts, async host => {
|
||||
const hostUuid = host.uuid
|
||||
const missingPatches = await this.listMissingPatches(hostUuid)
|
||||
hasMissingPatchesByHost[hostUuid] = missingPatches.length > 0
|
||||
})
|
||||
|
||||
// On XS/CH, start by installing patches on all hosts
|
||||
if (!isXcp) {
|
||||
log.debug('Install patches')
|
||||
await this.installPatches({ xsCredentials })
|
||||
}
|
||||
|
||||
// Remember on which hosts the running VMs are
|
||||
const vmRefsByHost = mapValues(
|
||||
groupBy(
|
||||
filter(this.objects.all, {
|
||||
$type: 'VM',
|
||||
power_state: 'Running',
|
||||
is_control_domain: false,
|
||||
}),
|
||||
vm => {
|
||||
const hostId = vm.$resident_on?.$id
|
||||
|
||||
if (hostId === undefined) {
|
||||
throw new Error('Could not find host of all running VMs')
|
||||
}
|
||||
|
||||
return hostId
|
||||
await this.rollingPoolReboot({
|
||||
xsCredentials,
|
||||
beforeEvacuateVms: async () => {
|
||||
// On XS/CH, start by installing patches on all hosts
|
||||
if (!isXcp) {
|
||||
log.debug('Install patches')
|
||||
await this.installPatches({ xsCredentials })
|
||||
}
|
||||
),
|
||||
vms => vms.map(vm => vm.$ref)
|
||||
)
|
||||
|
||||
// Put master in first position to restart it first
|
||||
const indexOfMaster = hosts.findIndex(host => host.$ref === this.pool.master)
|
||||
if (indexOfMaster === -1) {
|
||||
throw new Error('Could not find pool master')
|
||||
}
|
||||
;[hosts[0], hosts[indexOfMaster]] = [hosts[indexOfMaster], hosts[0]]
|
||||
|
||||
// Restart all the hosts one by one
|
||||
for (const host of hosts) {
|
||||
const hostId = host.uuid
|
||||
if (!hasMissingPatchesByHost[hostId]) {
|
||||
continue
|
||||
}
|
||||
|
||||
// This is an old metrics reference from before the pool master restart.
|
||||
// The references don't seem to change but it's not guaranteed.
|
||||
const metricsRef = host.metrics
|
||||
|
||||
await this.barrier(metricsRef)
|
||||
await this._waitObjectState(metricsRef, metrics => metrics.live)
|
||||
|
||||
const getServerTime = async () => parseDateTime(await this.call('host.get_servertime', host.$ref)) * 1e3
|
||||
let rebootTime
|
||||
if (isXcp) {
|
||||
// On XCP-ng, install patches on each host one by one instead of all at once
|
||||
log.debug(`Evacuate host ${hostId}`)
|
||||
await this.clearHost(host)
|
||||
log.debug(`Install patches on host ${hostId}`)
|
||||
await this.installPatches({ hosts: [host] })
|
||||
log.debug(`Restart host ${hostId}`)
|
||||
rebootTime = await getServerTime()
|
||||
await this.callAsync('host.reboot', host.$ref)
|
||||
} else {
|
||||
// On XS/CH, we only need to evacuate/restart the hosts one by one since patches have already been installed
|
||||
log.debug(`Evacuate and restart host ${hostId}`)
|
||||
rebootTime = await getServerTime()
|
||||
await this.rebootHost(hostId)
|
||||
}
|
||||
|
||||
log.debug(`Wait for host ${hostId} to be up`)
|
||||
await timeout.call(
|
||||
(async () => {
|
||||
await this._waitObjectState(
|
||||
hostId,
|
||||
host => host.enabled && rebootTime < host.other_config.agent_start_time * 1e3
|
||||
)
|
||||
await this._waitObjectState(metricsRef, metrics => metrics.live)
|
||||
})(),
|
||||
this._restartHostTimeout,
|
||||
new Error(`Host ${hostId} took too long to restart`)
|
||||
)
|
||||
log.debug(`Host ${hostId} is up`)
|
||||
}
|
||||
|
||||
if (some(hasMissingPatchesByHost)) {
|
||||
log.debug('Migrate VMs back to where they were')
|
||||
}
|
||||
|
||||
// Start with the last host since it's the emptiest one after the rolling
|
||||
// update
|
||||
;[hosts[0], hosts[hosts.length - 1]] = [hosts[hosts.length - 1], hosts[0]]
|
||||
|
||||
let error
|
||||
for (const host of hosts) {
|
||||
const hostId = host.uuid
|
||||
if (!hasMissingPatchesByHost[hostId]) {
|
||||
continue
|
||||
}
|
||||
|
||||
const vmRefs = vmRefsByHost[hostId]
|
||||
|
||||
if (vmRefs === undefined) {
|
||||
continue
|
||||
}
|
||||
|
||||
// host.$resident_VMs is outdated and returns resident VMs before the host.evacuate.
|
||||
// this.getField is used in order not to get cached data.
|
||||
const residentVmRefs = await this.getField('host', host.$ref, 'resident_VMs')
|
||||
|
||||
for (const vmRef of vmRefs) {
|
||||
if (residentVmRefs.includes(vmRef)) {
|
||||
continue
|
||||
},
|
||||
beforeRebootHost: async host => {
|
||||
if (isXcp) {
|
||||
log.debug(`Install patches on host ${host.id}`)
|
||||
await this.installPatches({ hosts: [host] })
|
||||
}
|
||||
|
||||
try {
|
||||
const vmId = await this.getField('VM', vmRef, 'uuid')
|
||||
await this.migrateVm(vmId, this, hostId)
|
||||
} catch (err) {
|
||||
log.error(err)
|
||||
if (error === undefined) {
|
||||
error = err
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (error !== undefined) {
|
||||
throw error
|
||||
}
|
||||
},
|
||||
ignoreHost: host => {
|
||||
return !hasMissingPatchesByHost[host.uuid]
|
||||
},
|
||||
})
|
||||
},
|
||||
}
|
||||
|
||||
|
@ -1,7 +1,16 @@
|
||||
import { cancelable, timeout } from 'promise-toolbox'
|
||||
import { createLogger } from '@xen-orchestra/log'
|
||||
import { decorateObject } from '@vates/decorate-with'
|
||||
import { cancelable } from 'promise-toolbox'
|
||||
import { defer as deferrable } from 'golike-defer'
|
||||
import { incorrectState } from 'xo-common/api-errors.js'
|
||||
import { isHostRunning } from '../utils.mjs'
|
||||
import { parseDateTime } from '@xen-orchestra/xapi'
|
||||
import filter from 'lodash/filter.js'
|
||||
import groupBy from 'lodash/groupBy.js'
|
||||
import mapValues from 'lodash/mapValues.js'
|
||||
|
||||
const PATH_DB_DUMP = '/pool/xmldbdump'
|
||||
const log = createLogger('xo:xapi')
|
||||
|
||||
const methods = {
|
||||
exportPoolMetadata($cancelToken) {
|
||||
@ -21,9 +30,156 @@ const methods = {
|
||||
task: this.task_create('Import pool metadata'),
|
||||
})
|
||||
},
|
||||
|
||||
/**
 * Reboot every host of the pool one at a time: evacuate its VMs, reboot it,
 * wait for it to come back, then migrate the VMs back where they were.
 *
 * @param {Function} $defer - injected by `deferrable`; used to re-enable HA on exit
 * @param {Object} [hooks]
 * @param {Function} [hooks.beforeEvacuateVms] - awaited once after the pre-checks, before any evacuation
 * @param {Function} [hooks.beforeRebootHost] - awaited with each host after its evacuation, before its reboot
 * @param {Function} [hooks.ignoreHost] - predicate; hosts for which it returns true are skipped entirely
 * @throws incorrectState when a host of the pool is not running
 * @throws Error when a running VM has no resident host, the master cannot be found,
 *   or a host does not come back within `_restartHostTimeout`
 */
async rollingPoolReboot($defer, { beforeEvacuateVms, beforeRebootHost, ignoreHost } = {}) {
  // Temporarily disable HA (it would fight the evacuations/reboots) and
  // guarantee it is restored with its original statefiles/configuration.
  if (this.pool.ha_enabled) {
    const haSrs = this.pool.$ha_statefiles.map(vdi => vdi.SR)
    const haConfig = this.pool.ha_configuration
    await this.call('pool.disable_ha')
    $defer(() => this.call('pool.enable_ha', haSrs, haConfig))
  }

  const hosts = filter(this.objects.all, { $type: 'host' })

  // Pre-check: refuse to start if any host is down.
  {
    const deadHost = hosts.find(_ => !isHostRunning(_))
    if (deadHost !== undefined) {
      // reflect the interface of an XO host object
      throw incorrectState({
        actual: 'Halted',
        expected: 'Running',
        object: deadHost.$id,
        property: 'power_state',
      })
    }
  }

  // Pre-check: every host must be able to evacuate its VMs.
  await Promise.all(hosts.map(host => host.$call('assert_can_evacuate')))

  if (beforeEvacuateVms) {
    await beforeEvacuateVms()
  }
  // Remember on which hosts the running VMs are
  const vmRefsByHost = mapValues(
    groupBy(
      filter(this.objects.all, {
        $type: 'VM',
        power_state: 'Running',
        is_control_domain: false,
      }),
      vm => {
        const hostId = vm.$resident_on?.$id

        if (hostId === undefined) {
          throw new Error('Could not find host of all running VMs')
        }

        return hostId
      }
    ),
    vms => vms.map(vm => vm.$ref)
  )

  // Put master in first position to restart it first
  const indexOfMaster = hosts.findIndex(host => host.$ref === this.pool.master)
  if (indexOfMaster === -1) {
    throw new Error('Could not find pool master')
  }
  ;[hosts[0], hosts[indexOfMaster]] = [hosts[indexOfMaster], hosts[0]]

  let hasRestartedOne = false
  // Restart all the hosts one by one
  for (const host of hosts) {
    const hostId = host.uuid
    if (ignoreHost && ignoreHost(host)) {
      continue
    }

    // This is an old metrics reference from before the pool master restart.
    // The references don't seem to change but it's not guaranteed.
    const metricsRef = host.metrics

    await this.barrier(metricsRef)
    await this._waitObjectState(metricsRef, metrics => metrics.live)

    // Server-side clock, in ms, used below to detect that the agent actually
    // restarted (agent_start_time must be later than this timestamp).
    const getServerTime = async () => parseDateTime(await this.call('host.get_servertime', host.$ref)) * 1e3
    log.debug(`Evacuate host ${hostId}`)
    await this.clearHost(host)

    if (beforeRebootHost) {
      await beforeRebootHost(host)
    }

    log.debug(`Restart host ${hostId}`)
    const rebootTime = await getServerTime()
    await this.callAsync('host.reboot', host.$ref)

    // Wait until the host is enabled with a post-reboot agent start time and
    // its metrics report it live again, bounded by _restartHostTimeout.
    log.debug(`Wait for host ${hostId} to be up`)
    await timeout.call(
      (async () => {
        await this._waitObjectState(
          hostId,
          host => host.enabled && rebootTime < host.other_config.agent_start_time * 1e3
        )
        await this._waitObjectState(metricsRef, metrics => metrics.live)
      })(),
      this._restartHostTimeout,
      new Error(`Host ${hostId} took too long to restart`)
    )
    log.debug(`Host ${hostId} is up`)
    hasRestartedOne = true
  }

  if (hasRestartedOne) {
    log.debug('Migrate VMs back to where they were')
  }

  // Start with the last host since it's the emptiest one after the rolling
  // update
  ;[hosts[0], hosts[hosts.length - 1]] = [hosts[hosts.length - 1], hosts[0]]

  // Best-effort migration back: log each failure, keep going, and rethrow
  // the first error at the end.
  let error
  for (const host of hosts) {
    const hostId = host.uuid
    if (ignoreHost && ignoreHost(host)) {
      continue
    }

    const vmRefs = vmRefsByHost[hostId]

    if (vmRefs === undefined) {
      continue
    }

    // host.$resident_VMs is outdated and returns resident VMs before the host.evacuate.
    // this.getField is used in order not to get cached data.
    const residentVmRefs = await this.getField('host', host.$ref, 'resident_VMs')

    for (const vmRef of vmRefs) {
      // Already back on its original host: nothing to do.
      if (residentVmRefs.includes(vmRef)) {
        continue
      }

      try {
        const vmId = await this.getField('VM', vmRef, 'uuid')
        await this.migrateVm(vmId, this, hostId)
      } catch (err) {
        log.error(err)
        if (error === undefined) {
          error = err
        }
      }
    }
  }

  if (error !== undefined) {
    throw error
  }
},
|
||||
}
|
||||
|
||||
// Export the methods wrapped with their decorators:
// - `cancelable` supplies the `$cancelToken` first argument (see exportPoolMetadata)
// - `deferrable` supplies the `$defer` helper rollingPoolReboot uses to restore HA
export default decorateObject(methods, {
  exportPoolMetadata: cancelable,
  importPoolMetadata: cancelable,
  rollingPoolReboot: deferrable,
})
|
||||
|
@ -33,6 +33,7 @@ const AUTHORIZATIONS = {
|
||||
LIST_MISSING_PATCHES: STARTER,
|
||||
POOL_EMERGENCY_SHUTDOWN: ENTERPRISE,
|
||||
ROLLING_POOL_UPDATE: ENTERPRISE,
|
||||
ROLLING_POOL_REBOOT: ENTERPRISE,
|
||||
}
|
||||
|
||||
export default class Authorization {
|
||||
|
@ -134,4 +134,10 @@ export default class Pools {
|
||||
srsByPool[pool.id].some(sr => sr.size - sr.physical_usage >= minAvailableSrSize && checkSrName(sr.name_label))
|
||||
)
|
||||
}
|
||||
|
||||
async rollingPoolReboot(pool) {
|
||||
const { _app } = this
|
||||
await _app.checkFeatureAuthorization('ROLLING_POOL_REBOOT')
|
||||
await _app.getXapi(pool).rollingPoolReboot()
|
||||
}
|
||||
}
|
||||
|
@ -641,7 +641,7 @@ export default class XenServers {
|
||||
|
||||
async rollingPoolUpdate($defer, pool) {
|
||||
const app = this._app
|
||||
|
||||
await app.checkFeatureAuthorization('ROLLING_POOL_UPDATE')
|
||||
const [schedules, jobs] = await Promise.all([app.getAllSchedules(), app.getAllJobs('backup')])
|
||||
|
||||
const poolId = pool.id
|
||||
|
Loading…
Reference in New Issue
Block a user