From 9ce52fb1dabe8389c43f9616524a92f09c7ef3a7 Mon Sep 17 00:00:00 2001 From: Jesse Hallam Date: Mon, 12 Aug 2019 18:48:02 -0300 Subject: [PATCH] MM-16378: revert plugin health check config removal (#11819) * revert 4e5f6fcfbc85ea24788241c716862e1833c6f60e This reverts the configuration setting to disable plugin health checks, preferring instead to retain this functionality for advanced cases. In part, this was driven by the discovery that the health checks were failing on Windows, though that will be addressed separately. Fixes: MM-16378 * simplify interval handling/logging Also shutdown the job when plugins are altogether disabled. --- app/diagnostics.go | 1 + app/server.go | 8 +++++++- mlog/log.go | 1 + model/config.go | 5 +++++ plugin/health_check.go | 38 ++++++++++++++++++++++++------------- plugin/health_check_test.go | 10 +++++----- 6 files changed, 44 insertions(+), 19 deletions(-) diff --git a/app/diagnostics.go b/app/diagnostics.go index 688e7646f8..c0ddd871fc 100644 --- a/app/diagnostics.go +++ b/app/diagnostics.go @@ -597,6 +597,7 @@ func (a *App) trackConfig() { "enable": *cfg.PluginSettings.Enable, "enable_uploads": *cfg.PluginSettings.EnableUploads, "allow_insecure_download_url": *cfg.PluginSettings.AllowInsecureDownloadUrl, + "enable_health_check": *cfg.PluginSettings.EnableHealthCheck, }) a.SendDiagnostic(TRACK_CONFIG_DATA_RETENTION, map[string]interface{}{ diff --git a/app/server.go b/app/server.go index 3850b47793..7a832530ed 100644 --- a/app/server.go +++ b/app/server.go @@ -204,8 +204,14 @@ func NewServer(options ...Option) (*Server, error) { // Start plugin health check job pluginsEnvironment := s.PluginsEnvironment if pluginsEnvironment != nil { - pluginsEnvironment.InitPluginHealthCheckJob() + pluginsEnvironment.InitPluginHealthCheckJob(*s.Config().PluginSettings.Enable && *s.Config().PluginSettings.EnableHealthCheck) } + s.AddConfigListener(func(_, c *model.Config) { + pluginsEnvironment := s.PluginsEnvironment + if pluginsEnvironment != nil { + pluginsEnvironment.InitPluginHealthCheckJob(*s.Config().PluginSettings.Enable && *c.PluginSettings.EnableHealthCheck) + } + }) mlog.Info(fmt.Sprintf("Current version is %v (%v/%v/%v/%v)", model.CurrentVersion, model.BuildNumber, model.BuildDate, model.BuildHash, model.BuildHashEnterprise)) mlog.Info(fmt.Sprintf("Enterprise Enabled: %v", model.BuildEnterpriseReady)) diff --git a/mlog/log.go b/mlog/log.go index a2945046fe..503b140ed6 100644 --- a/mlog/log.go +++ b/mlog/log.go @@ -34,6 +34,7 @@ var String = zap.String var Any = zap.Any var Err = zap.Error var Bool = zap.Bool +var Duration = zap.Duration type LoggerConfiguration struct { EnableConsole bool diff --git a/model/config.go b/model/config.go index 0a86c60bcb..b3c5393fc6 100644 --- a/model/config.go +++ b/model/config.go @@ -2187,6 +2187,7 @@ type PluginSettings struct { Enable *bool EnableUploads *bool `restricted:"true"` AllowInsecureDownloadUrl *bool `restricted:"true"` + EnableHealthCheck *bool `restricted:"true"` Directory *string `restricted:"true"` ClientDirectory *string `restricted:"true"` Plugins map[string]map[string]interface{} @@ -2206,6 +2207,10 @@ func (s *PluginSettings) SetDefaults(ls LogSettings) { s.AllowInsecureDownloadUrl = NewBool(false) } + if s.EnableHealthCheck == nil { + s.EnableHealthCheck = NewBool(true) + } + if s.Directory == nil { s.Directory = NewString(PLUGIN_SETTINGS_DEFAULT_DIRECTORY) } diff --git a/plugin/health_check.go b/plugin/health_check.go index 173d7bfb81..7f9855e125 100644 --- a/plugin/health_check.go +++ b/plugin/health_check.go @@ -12,10 +12,10 @@ import ( ) const ( - HEALTH_CHECK_INTERVAL = 30 // seconds. How often the health check should run - HEALTH_CHECK_DISABLE_DURATION = 60 // minutes. How long we wait for num fails to incur before disabling the plugin - HEALTH_CHECK_PING_FAIL_LIMIT = 3 // How many times we call RPC ping in a row before it is considered a failure - HEALTH_CHECK_RESTART_LIMIT = 3 // How many times we restart a plugin before we disable it + HEALTH_CHECK_INTERVAL = 30 * time.Second // How often the health check should run + HEALTH_CHECK_DISABLE_DURATION = 60 * time.Minute // How long we wait for num fails to incur before disabling the plugin + HEALTH_CHECK_PING_FAIL_LIMIT = 3 // How many times we call RPC ping in a row before it is considered a failure + HEALTH_CHECK_RESTART_LIMIT = 3 // How many times we restart a plugin before we disable it ) type PluginHealthCheckJob struct { @@ -24,22 +24,34 @@ type PluginHealthCheckJob struct { env *Environment } -// InitPluginHealthCheckJob starts a new job for checking all active plugins -func (env *Environment) InitPluginHealthCheckJob() { - job := newPluginHealthCheckJob(env) - env.pluginHealthCheckJob = job - job.Start() +// InitPluginHealthCheckJob starts a new job if one is not running and is set to enabled, or kills an existing one if set to disabled. +func (env *Environment) InitPluginHealthCheckJob(enable bool) { + // Config is set to enable. No job exists, start a new job. + if enable && env.pluginHealthCheckJob == nil { + mlog.Debug("Enabling plugin health check job", mlog.Duration("interval_s", HEALTH_CHECK_INTERVAL)) + + job := newPluginHealthCheckJob(env) + env.pluginHealthCheckJob = job + job.Start() + } + + // Config is set to disable. Job exists, kill existing job. + if !enable && env.pluginHealthCheckJob != nil { + mlog.Debug("Disabling plugin health check job") + + env.pluginHealthCheckJob.Cancel() + env.pluginHealthCheckJob = nil + } } // Start continuously runs health checks on all active plugins, on a timer. func (job *PluginHealthCheckJob) Start() { - interval := time.Duration(HEALTH_CHECK_INTERVAL) * time.Second - mlog.Debug(fmt.Sprintf("Plugin health check job starting. Sending health check pings every %v seconds.", interval)) + mlog.Debug("Plugin health check job starting.") go func() { defer close(job.cancelled) - ticker := time.NewTicker(interval) + ticker := time.NewTicker(HEALTH_CHECK_INTERVAL) defer func() { ticker.Stop() }() @@ -125,7 +137,7 @@ func shouldDeactivatePlugin(rp *registeredPlugin) bool { index := len(rp.failTimeStamps) - HEALTH_CHECK_RESTART_LIMIT t := rp.failTimeStamps[index] now := time.Now() - elapsed := now.Sub(t).Minutes() + elapsed := now.Sub(t) if elapsed <= HEALTH_CHECK_DISABLE_DURATION { return true } diff --git a/plugin/health_check_test.go b/plugin/health_check_test.go index dec9c59750..5e01fc2fba 100644 --- a/plugin/health_check_test.go +++ b/plugin/health_check_test.go @@ -130,8 +130,8 @@ func TestShouldDeactivatePlugin(t *testing.T) { // Failures are recent enough to restart rp = newRegisteredPlugin(bundle) - rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*0.2*time.Minute)) - rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*0.1*time.Minute)) + rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION/10*2)) + rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION/10)) rp.failTimeStamps = append(rp.failTimeStamps, now) result = shouldDeactivatePlugin(rp) @@ -139,8 +139,8 @@ func TestShouldDeactivatePlugin(t *testing.T) { // Failures are too spaced out to warrant a restart rp = newRegisteredPlugin(bundle) - rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*2*time.Minute)) - rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*1*time.Minute)) + rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*2)) + rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*1)) rp.failTimeStamps = append(rp.failTimeStamps, now) result = shouldDeactivatePlugin(rp) @@ -148,7 +148,7 @@ func TestShouldDeactivatePlugin(t *testing.T) { // Not enough failures are present to warrant a restart rp = newRegisteredPlugin(bundle) - rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*0.1*time.Minute)) + rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION/10)) rp.failTimeStamps = append(rp.failTimeStamps, now) result = shouldDeactivatePlugin(rp)