diff --git a/app/diagnostics.go b/app/diagnostics.go index 688e7646f8..c0ddd871fc 100644 --- a/app/diagnostics.go +++ b/app/diagnostics.go @@ -597,6 +597,7 @@ func (a *App) trackConfig() { "enable": *cfg.PluginSettings.Enable, "enable_uploads": *cfg.PluginSettings.EnableUploads, "allow_insecure_download_url": *cfg.PluginSettings.AllowInsecureDownloadUrl, + "enable_health_check": *cfg.PluginSettings.EnableHealthCheck, }) a.SendDiagnostic(TRACK_CONFIG_DATA_RETENTION, map[string]interface{}{ diff --git a/app/server.go b/app/server.go index 3850b47793..7a832530ed 100644 --- a/app/server.go +++ b/app/server.go @@ -204,8 +204,14 @@ func NewServer(options ...Option) (*Server, error) { // Start plugin health check job pluginsEnvironment := s.PluginsEnvironment if pluginsEnvironment != nil { - pluginsEnvironment.InitPluginHealthCheckJob() + pluginsEnvironment.InitPluginHealthCheckJob(*s.Config().PluginSettings.Enable && *s.Config().PluginSettings.EnableHealthCheck) } + s.AddConfigListener(func(_, c *model.Config) { + pluginsEnvironment := s.PluginsEnvironment + if pluginsEnvironment != nil { + pluginsEnvironment.InitPluginHealthCheckJob(*s.Config().PluginSettings.Enable && *c.PluginSettings.EnableHealthCheck) + } + }) mlog.Info(fmt.Sprintf("Current version is %v (%v/%v/%v/%v)", model.CurrentVersion, model.BuildNumber, model.BuildDate, model.BuildHash, model.BuildHashEnterprise)) mlog.Info(fmt.Sprintf("Enterprise Enabled: %v", model.BuildEnterpriseReady)) diff --git a/mlog/log.go b/mlog/log.go index a2945046fe..503b140ed6 100644 --- a/mlog/log.go +++ b/mlog/log.go @@ -34,6 +34,7 @@ var String = zap.String var Any = zap.Any var Err = zap.Error var Bool = zap.Bool +var Duration = zap.Duration type LoggerConfiguration struct { EnableConsole bool diff --git a/model/config.go b/model/config.go index 0a86c60bcb..b3c5393fc6 100644 --- a/model/config.go +++ b/model/config.go @@ -2187,6 +2187,7 @@ type PluginSettings struct { Enable *bool EnableUploads *bool `restricted:"true"` AllowInsecureDownloadUrl *bool `restricted:"true"` + EnableHealthCheck *bool `restricted:"true"` Directory *string `restricted:"true"` ClientDirectory *string `restricted:"true"` Plugins map[string]map[string]interface{} @@ -2206,6 +2207,10 @@ func (s *PluginSettings) SetDefaults(ls LogSettings) { s.AllowInsecureDownloadUrl = NewBool(false) } + if s.EnableHealthCheck == nil { + s.EnableHealthCheck = NewBool(true) + } + if s.Directory == nil { s.Directory = NewString(PLUGIN_SETTINGS_DEFAULT_DIRECTORY) } diff --git a/plugin/health_check.go b/plugin/health_check.go index 173d7bfb81..7f9855e125 100644 --- a/plugin/health_check.go +++ b/plugin/health_check.go @@ -12,10 +12,10 @@ import ( ) const ( - HEALTH_CHECK_INTERVAL = 30 // seconds. How often the health check should run - HEALTH_CHECK_DISABLE_DURATION = 60 // minutes. How long we wait for num fails to incur before disabling the plugin - HEALTH_CHECK_PING_FAIL_LIMIT = 3 // How many times we call RPC ping in a row before it is considered a failure - HEALTH_CHECK_RESTART_LIMIT = 3 // How many times we restart a plugin before we disable it + HEALTH_CHECK_INTERVAL = 30 * time.Second // How often the health check should run + HEALTH_CHECK_DISABLE_DURATION = 60 * time.Minute // How long we wait for num fails to incur before disabling the plugin + HEALTH_CHECK_PING_FAIL_LIMIT = 3 // How many times we call RPC ping in a row before it is considered a failure + HEALTH_CHECK_RESTART_LIMIT = 3 // How many times we restart a plugin before we disable it ) type PluginHealthCheckJob struct { @@ -24,22 +24,34 @@ type PluginHealthCheckJob struct { env *Environment } -// InitPluginHealthCheckJob starts a new job for checking all active plugins -func (env *Environment) InitPluginHealthCheckJob() { - job := newPluginHealthCheckJob(env) - env.pluginHealthCheckJob = job - job.Start() +// InitPluginHealthCheckJob starts a new job if one is not running and is set to enabled, or kills an existing one if set to disabled. +func (env *Environment) InitPluginHealthCheckJob(enable bool) { + // Config is set to enable. No job exists, start a new job. + if enable && env.pluginHealthCheckJob == nil { + mlog.Debug("Enabling plugin health check job", mlog.Duration("interval_s", HEALTH_CHECK_INTERVAL)) + + job := newPluginHealthCheckJob(env) + env.pluginHealthCheckJob = job + job.Start() + } + + // Config is set to disable. Job exists, kill existing job. + if !enable && env.pluginHealthCheckJob != nil { + mlog.Debug("Disabling plugin health check job") + + env.pluginHealthCheckJob.Cancel() + env.pluginHealthCheckJob = nil + } } // Start continuously runs health checks on all active plugins, on a timer. func (job *PluginHealthCheckJob) Start() { - interval := time.Duration(HEALTH_CHECK_INTERVAL) * time.Second - mlog.Debug(fmt.Sprintf("Plugin health check job starting. Sending health check pings every %v seconds.", interval)) + mlog.Debug("Plugin health check job starting.") go func() { defer close(job.cancelled) - ticker := time.NewTicker(interval) + ticker := time.NewTicker(HEALTH_CHECK_INTERVAL) defer func() { ticker.Stop() }() @@ -125,7 +137,7 @@ func shouldDeactivatePlugin(rp *registeredPlugin) bool { index := len(rp.failTimeStamps) - HEALTH_CHECK_RESTART_LIMIT t := rp.failTimeStamps[index] now := time.Now() - elapsed := now.Sub(t).Minutes() + elapsed := now.Sub(t) if elapsed <= HEALTH_CHECK_DISABLE_DURATION { return true } diff --git a/plugin/health_check_test.go b/plugin/health_check_test.go index dec9c59750..5e01fc2fba 100644 --- a/plugin/health_check_test.go +++ b/plugin/health_check_test.go @@ -130,8 +130,8 @@ func TestShouldDeactivatePlugin(t *testing.T) { // Failures are recent enough to restart rp = newRegisteredPlugin(bundle) - rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*0.2*time.Minute)) - rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*0.1*time.Minute)) + rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION/10*2)) + rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION/10)) rp.failTimeStamps = append(rp.failTimeStamps, now) result = shouldDeactivatePlugin(rp) @@ -139,8 +139,8 @@ func TestShouldDeactivatePlugin(t *testing.T) { // Failures are too spaced out to warrant a restart rp = newRegisteredPlugin(bundle) - rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*2*time.Minute)) - rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*1*time.Minute)) + rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*2)) + rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*1)) rp.failTimeStamps = append(rp.failTimeStamps, now) result = shouldDeactivatePlugin(rp) @@ -148,7 +148,7 @@ func TestShouldDeactivatePlugin(t *testing.T) { // Not enough failures are present to warrant a restart rp = newRegisteredPlugin(bundle) - rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION*0.1*time.Minute)) + rp.failTimeStamps = append(rp.failTimeStamps, now.Add(-HEALTH_CHECK_DISABLE_DURATION/10)) rp.failTimeStamps = append(rp.failTimeStamps, now) result = shouldDeactivatePlugin(rp)