MM-16378: revert plugin health check config removal (#11819)

* revert 4e5f6fcfbc

This reverts the configuration setting to disable plugin health checks, preferring instead to retain this functionality for advanced cases. In part, this was driven by the discovery that the health checks were failing on Windows, though that will be addressed separately.

Fixes: MM-16378

* simplify interval handling/logging

Also shutdown the job when plugins are altogether disabled.
This commit is contained in:
Jesse Hallam
2019-08-12 18:48:02 -03:00
committed by GitHub
parent 1135e42ac0
commit 9ce52fb1da
6 changed files with 44 additions and 19 deletions

View File

@@ -597,6 +597,7 @@ func (a *App) trackConfig() {
"enable": *cfg.PluginSettings.Enable,
"enable_uploads": *cfg.PluginSettings.EnableUploads,
"allow_insecure_download_url": *cfg.PluginSettings.AllowInsecureDownloadUrl,
"enable_health_check": *cfg.PluginSettings.EnableHealthCheck,
})
a.SendDiagnostic(TRACK_CONFIG_DATA_RETENTION, map[string]interface{}{

View File

@@ -204,8 +204,14 @@ func NewServer(options ...Option) (*Server, error) {
// Start plugin health check job
pluginsEnvironment := s.PluginsEnvironment
if pluginsEnvironment != nil {
pluginsEnvironment.InitPluginHealthCheckJob()
pluginsEnvironment.InitPluginHealthCheckJob(*s.Config().PluginSettings.Enable && *s.Config().PluginSettings.EnableHealthCheck)
}
s.AddConfigListener(func(_, c *model.Config) {
pluginsEnvironment := s.PluginsEnvironment
if pluginsEnvironment != nil {
pluginsEnvironment.InitPluginHealthCheckJob(*s.Config().PluginSettings.Enable && *c.PluginSettings.EnableHealthCheck)
}
})
mlog.Info(fmt.Sprintf("Current version is %v (%v/%v/%v/%v)", model.CurrentVersion, model.BuildNumber, model.BuildDate, model.BuildHash, model.BuildHashEnterprise))
mlog.Info(fmt.Sprintf("Enterprise Enabled: %v", model.BuildEnterpriseReady))

View File

@@ -34,6 +34,7 @@ var String = zap.String
var Any = zap.Any
var Err = zap.Error
var Bool = zap.Bool
var Duration = zap.Duration
type LoggerConfiguration struct {
EnableConsole bool

View File

@@ -2187,6 +2187,7 @@ type PluginSettings struct {
Enable *bool
EnableUploads *bool `restricted:"true"`
AllowInsecureDownloadUrl *bool `restricted:"true"`
EnableHealthCheck *bool `restricted:"true"`
Directory *string `restricted:"true"`
ClientDirectory *string `restricted:"true"`
Plugins map[string]map[string]interface{}
@@ -2206,6 +2207,10 @@ func (s *PluginSettings) SetDefaults(ls LogSettings) {
s.AllowInsecureDownloadUrl = NewBool(false)
}
if s.EnableHealthCheck == nil {
s.EnableHealthCheck = NewBool(true)
}
if s.Directory == nil {
s.Directory = NewString(PLUGIN_SETTINGS_DEFAULT_DIRECTORY)
}

View File

@@ -12,10 +12,10 @@ import (
)
const (
HEALTH_CHECK_INTERVAL = 30 // seconds. How often the health check should run
HEALTH_CHECK_DISABLE_DURATION = 60 // minutes. How long we wait for num fails to incur before disabling the plugin
HEALTH_CHECK_PING_FAIL_LIMIT = 3 // How many times we call RPC ping in a row before it is considered a failure
HEALTH_CHECK_RESTART_LIMIT = 3 // How many times we restart a plugin before we disable it
HEALTH_CHECK_INTERVAL = 30 * time.Second // How often the health check should run
HEALTH_CHECK_DISABLE_DURATION = 60 * time.Minute // How long we wait for num fails to incur before disabling the plugin
HEALTH_CHECK_PING_FAIL_LIMIT = 3 // How many times we call RPC ping in a row before it is considered a failure
HEALTH_CHECK_RESTART_LIMIT = 3 // How many times we restart a plugin before we disable it
)
type PluginHealthCheckJob struct {
@@ -24,22 +24,34 @@ type PluginHealthCheckJob struct {
env *Environment
}
// InitPluginHealthCheckJob starts a new job for checking all active plugins
func (env *Environment) InitPluginHealthCheckJob() {
job := newPluginHealthCheckJob(env)
env.pluginHealthCheckJob = job
job.Start()
// InitPluginHealthCheckJob starts a new job if one is not running and is set to enabled, or kills an existing one if set to disabled.
func (env *Environment) InitPluginHealthCheckJob(enable bool) {
// Config is set to enable. No job exists, start a new job.
if enable && env.pluginHealthCheckJob == nil {
mlog.Debug("Enabling plugin health check job", mlog.Duration("interval_s", HEALTH_CHECK_INTERVAL))
job := newPluginHealthCheckJob(env)
env.pluginHealthCheckJob = job
job.Start()
}
// Config is set to disable. Job exists, kill existing job.
if !enable && env.pluginHealthCheckJob != nil {
mlog.Debug("Disabling plugin health check job")
env.pluginHealthCheckJob.Cancel()
env.pluginHealthCheckJob = nil
}
}
// Start continuously runs health checks on all active plugins, on a timer.
func (job *PluginHealthCheckJob) Start() {
interval := time.Duration(HEALTH_CHECK_INTERVAL) * time.Second
mlog.Debug(fmt.Sprintf("Plugin health check job starting. Sending health check pings every %v seconds.", interval))
mlog.Debug("Plugin health check job starting.")
go func() {
defer close(job.cancelled)
ticker := time.NewTicker(interval)
ticker := time.NewTicker(HEALTH_CHECK_INTERVAL)
defer func() {
ticker.Stop()
}()
@@ -125,7 +137,7 @@ func shouldDeactivatePlugin(rp *registeredPlugin) bool {
index := len(rp.failTimeStamps) - HEALTH_CHECK_RESTART_LIMIT
t := rp.failTimeStamps[index]
now := time.Now()
elapsed := now.Sub(t).Minutes()
elapsed := now.Sub(t)
if elapsed <= HEALTH_CHECK_DISABLE_DURATION {
return true
}

View File

@@ -130,8 +130,8 @@ func TestShouldDeactivatePlugin(t *testing.T) {
// Failures are recent enough to restart
rp = newRegisteredPlugin(bundle)
rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*0.2*time.Minute))
rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*0.1*time.Minute))
rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION/10*2))
rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION/10))
rp.failTimeStamps = append(rp.failTimeStamps, now)
result = shouldDeactivatePlugin(rp)
@@ -139,8 +139,8 @@ func TestShouldDeactivatePlugin(t *testing.T) {
// Failures are too spaced out to warrant a restart
rp = newRegisteredPlugin(bundle)
rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*2*time.Minute))
rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*1*time.Minute))
rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*2))
rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*1))
rp.failTimeStamps = append(rp.failTimeStamps, now)
result = shouldDeactivatePlugin(rp)
@@ -148,7 +148,7 @@ func TestShouldDeactivatePlugin(t *testing.T) {
// Not enough failures are present to warrant a restart
rp = newRegisteredPlugin(bundle)
rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*0.1*time.Minute))
rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION/10))
rp.failTimeStamps = append(rp.failTimeStamps, now)
result = shouldDeactivatePlugin(rp)