diff --git a/internal/backend/local/backend_apply.go b/internal/backend/local/backend_apply.go index 23d72fd975..f28b506019 100644 --- a/internal/backend/local/backend_apply.go +++ b/internal/backend/local/backend_apply.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "log" + "time" "github.com/hashicorp/terraform/internal/backend" "github.com/hashicorp/terraform/internal/command/views" @@ -74,6 +75,10 @@ func (b *Local) opApply( op.ReportResult(runningOp, diags) return } + // stateHook uses schemas for when it periodically persists state to the + // persistent storage backend. + stateHook.Schemas = schemas + stateHook.PersistInterval = 20 * time.Second // arbitrary interval that's hopefully a sweet spot var plan *plans.Plan // If we weren't given a plan, then we refresh/plan diff --git a/internal/backend/local/hook_state.go b/internal/backend/local/hook_state.go index 4c11496c25..241f380101 100644 --- a/internal/backend/local/hook_state.go +++ b/internal/backend/local/hook_state.go @@ -1,7 +1,9 @@ package local import ( + "log" "sync" + "time" "github.com/hashicorp/terraform/internal/states" "github.com/hashicorp/terraform/internal/states/statemgr" @@ -15,6 +17,21 @@ type StateHook struct { sync.Mutex StateMgr statemgr.Writer + + // If PersistInterval is nonzero then for any new state update after + // the duration has elapsed we'll try to persist a state snapshot + // to the persistent backend too. + // That's only possible if field Schemas is valid, because the + // StateMgr.PersistState function for some backends needs schemas. + PersistInterval time.Duration + + // Schemas are the schemas to use when persisting state due to + // PersistInterval. This is ignored if PersistInterval is zero, + // and PersistInterval is ignored if this is nil. + Schemas *terraform.Schemas + + lastPersist time.Time + forcePersist bool } var _ terraform.Hook = (*StateHook)(nil) @@ -23,11 +40,56 @@ func (h *StateHook) PostStateUpdate(new *states.State) (terraform.HookAction, er h.Lock() defer h.Unlock() + if h.lastPersist.IsZero() { + // The first PostStateUpdate starts the clock for intermediate + // calls to PersistState. + h.lastPersist = time.Now() + } + if h.StateMgr != nil { if err := h.StateMgr.WriteState(new); err != nil { return terraform.HookActionHalt, err } + if mgrPersist, ok := h.StateMgr.(statemgr.Persister); ok && h.PersistInterval != 0 && h.Schemas != nil { + if h.forcePersist || time.Since(h.lastPersist) >= h.PersistInterval { + err := mgrPersist.PersistState(h.Schemas) + if err != nil { + return terraform.HookActionHalt, err + } + h.lastPersist = time.Now() + } + } } return terraform.HookActionContinue, nil } + +func (h *StateHook) Stopping() { + h.Lock() + defer h.Unlock() + + // If Terraform has been asked to stop then that might mean that a hard + // kill signal will follow shortly in case Terraform doesn't stop + // quickly enough, and so we'll try to persist the latest state + // snapshot in the hope that it'll give the user less recovery work to + // do if they _do_ subsequently hard-kill Terraform during an apply. + + if mgrPersist, ok := h.StateMgr.(statemgr.Persister); ok && h.Schemas != nil { + err := mgrPersist.PersistState(h.Schemas) + if err != nil { + // This hook can't affect Terraform Core's ongoing behavior, + // but it's a best effort thing anyway so we'll just emit a + // log to aid with debugging. + log.Printf("[ERROR] Failed to persist state after interruption: %s", err) + } + + // While we're in the stopping phase we'll try to persist every + // new state update to maximize every opportunity we get to avoid + // losing track of objects that have been created or updated. + // Terraform Core won't start any new operations after it's been + // stopped, so at most we should see one more PostStateUpdate + // call per already-active request. + h.forcePersist = true + } + +} diff --git a/internal/backend/local/hook_state_test.go b/internal/backend/local/hook_state_test.go index 6e86ac728f..e678fc086c 100644 --- a/internal/backend/local/hook_state_test.go +++ b/internal/backend/local/hook_state_test.go @@ -1,8 +1,12 @@ package local import ( + "fmt" "testing" + "time" + "github.com/google/go-cmp/cmp" + "github.com/hashicorp/terraform/internal/states" "github.com/hashicorp/terraform/internal/states/statemgr" "github.com/hashicorp/terraform/internal/terraform" ) @@ -27,3 +31,125 @@ func TestStateHook(t *testing.T) { t.Fatalf("bad state: %#v", is.State()) } } + +func TestStateHookStopping(t *testing.T) { + is := &testPersistentState{} + hook := &StateHook{ + StateMgr: is, + Schemas: &terraform.Schemas{}, + PersistInterval: 4 * time.Hour, + lastPersist: time.Now(), + } + + s := statemgr.TestFullInitialState() + action, err := hook.PostStateUpdate(s) + if err != nil { + t.Fatalf("unexpected error from PostStateUpdate: %s", err) + } + if got, want := action, terraform.HookActionContinue; got != want { + t.Fatalf("wrong hookaction %#v; want %#v", got, want) + } + if is.Written == nil || !is.Written.Equal(s) { + t.Fatalf("mismatching state written") + } + if is.Persisted != nil { + t.Fatalf("persisted too soon") + } + + // We'll now force lastPersist to be long enough ago that persisting + // should be due on the next call. + hook.lastPersist = time.Now().Add(-5 * time.Hour) + hook.PostStateUpdate(s) + if is.Written == nil || !is.Written.Equal(s) { + t.Fatalf("mismatching state written") + } + if is.Persisted == nil || !is.Persisted.Equal(s) { + t.Fatalf("mismatching state persisted") + } + hook.PostStateUpdate(s) + if is.Written == nil || !is.Written.Equal(s) { + t.Fatalf("mismatching state written") + } + if is.Persisted == nil || !is.Persisted.Equal(s) { + t.Fatalf("mismatching state persisted") + } + + gotLog := is.CallLog + wantLog := []string{ + // Initial call before we reset lastPersist + "WriteState", + + // Write and then persist after we reset lastPersist + "WriteState", + "PersistState", + + // Final call when persisting wasn't due yet. + "WriteState", + } + if diff := cmp.Diff(wantLog, gotLog); diff != "" { + t.Fatalf("wrong call log so far\n%s", diff) + } + + // We'll reset the log now before we try seeing what happens after + // we use "Stopped". + is.CallLog = is.CallLog[:0] + is.Persisted = nil + + hook.Stopping() + if is.Persisted == nil || !is.Persisted.Equal(s) { + t.Fatalf("mismatching state persisted") + } + + is.Persisted = nil + hook.PostStateUpdate(s) + if is.Persisted == nil || !is.Persisted.Equal(s) { + t.Fatalf("mismatching state persisted") + } + is.Persisted = nil + hook.PostStateUpdate(s) + if is.Persisted == nil || !is.Persisted.Equal(s) { + t.Fatalf("mismatching state persisted") + } + + gotLog = is.CallLog + wantLog = []string{ + // "Stopping" immediately persisted + "PersistState", + + // PostStateUpdate then writes and persists on every call, + // on the assumption that we're now bailing out after + // being cancelled and trying to save as much state as we can. + "WriteState", + "PersistState", + "WriteState", + "PersistState", + } + if diff := cmp.Diff(wantLog, gotLog); diff != "" { + t.Fatalf("wrong call log once in stopping mode\n%s", diff) + } +} + +type testPersistentState struct { + CallLog []string + + Written *states.State + Persisted *states.State +} + +var _ statemgr.Writer = (*testPersistentState)(nil) +var _ statemgr.Persister = (*testPersistentState)(nil) + +func (sm *testPersistentState) WriteState(state *states.State) error { + sm.CallLog = append(sm.CallLog, "WriteState") + sm.Written = state + return nil +} + +func (sm *testPersistentState) PersistState(schemas *terraform.Schemas) error { + if schemas == nil { + return fmt.Errorf("no schemas") + } + sm.CallLog = append(sm.CallLog, "PersistState") + sm.Persisted = sm.Written + return nil +} diff --git a/internal/terraform/context.go b/internal/terraform/context.go index 18b3f4b37b..db0249fbb5 100644 --- a/internal/terraform/context.go +++ b/internal/terraform/context.go @@ -193,6 +193,12 @@ func (c *Context) Stop() { c.runContextCancel = nil } + // Notify all of the hooks that we're stopping, in case they want to try + // to flush in-memory state to disk before a subsequent hard kill. + for _, hook := range c.hooks { + hook.Stopping() + } + // Grab the condition var before we exit if cond := c.runCond; cond != nil { log.Printf("[INFO] terraform: waiting for graceful stop to complete") diff --git a/internal/terraform/context_apply_test.go b/internal/terraform/context_apply_test.go index 4f16453310..fb1a728200 100644 --- a/internal/terraform/context_apply_test.go +++ b/internal/terraform/context_apply_test.go @@ -64,6 +64,148 @@ func TestContext2Apply_basic(t *testing.T) { } } +func TestContext2Apply_stop(t *testing.T) { + t.Parallel() + + m := testModule(t, "apply-stop") + stopCh := make(chan struct{}) + waitCh := make(chan struct{}) + stoppedCh := make(chan struct{}) + p := &MockProvider{ + GetProviderSchemaResponse: &providers.GetProviderSchemaResponse{ + ResourceTypes: map[string]providers.Schema{ + "indefinite": { + Version: 1, + Block: &configschema.Block{ + Attributes: map[string]*configschema.Attribute{ + "result": { + Type: cty.String, + Computed: true, + }, + }, + }, + }, + }, + }, + PlanResourceChangeFn: func(prcr providers.PlanResourceChangeRequest) providers.PlanResourceChangeResponse { + log.Printf("[TRACE] TestContext2Apply_stop: no-op PlanResourceChange") + return providers.PlanResourceChangeResponse{ + PlannedState: cty.ObjectVal(map[string]cty.Value{ + "result": cty.UnknownVal(cty.String), + }), + } + }, + ApplyResourceChangeFn: func(arcr providers.ApplyResourceChangeRequest) providers.ApplyResourceChangeResponse { + // This will unblock the main test code once we reach this + // point, so that it'll then be guaranteed to call Stop + // while we're waiting in here. + close(waitCh) + + log.Printf("[TRACE] TestContext2Apply_stop: ApplyResourceChange waiting for Stop call") + // This will block until StopFn closes this channel below. + <-stopCh + // This unblocks StopFn below, thereby acknowledging the request + // to stop. + close(stoppedCh) + return providers.ApplyResourceChangeResponse{ + NewState: cty.ObjectVal(map[string]cty.Value{ + "result": cty.StringVal("complete"), + }), + } + }, + StopFn: func() error { + // Closing this channel will unblock the channel read in + // ApplyResourceChangeFn above. + log.Printf("[TRACE] TestContext2Apply_stop: Stop called") + close(stopCh) + // This will block until ApplyResourceChange has reacted to + // being stopped. + log.Printf("[TRACE] TestContext2Apply_stop: Waiting for ApplyResourceChange to react to being stopped") + <-stoppedCh + log.Printf("[TRACE] TestContext2Apply_stop: Stop is completing") + return nil + }, + } + + hook := &testHook{} + ctx := testContext2(t, &ContextOpts{ + Hooks: []Hook{hook}, + Providers: map[addrs.Provider]providers.Factory{ + addrs.MustParseProviderSourceString("terraform.io/test/indefinite"): testProviderFuncFixed(p), + }, + }) + + plan, diags := ctx.Plan(m, states.NewState(), DefaultPlanOpts) + assertNoErrors(t, diags) + + // We'll reset the hook events before we apply because we only care about + // the apply-time events. + hook.Calls = hook.Calls[:0] + + // We'll apply in the background so that we can call Stop in the foreground. + stateCh := make(chan *states.State) + go func(plan *plans.Plan) { + state, _ := ctx.Apply(plan, m) + stateCh <- state + }(plan) + + // We'll wait until the provider signals that we've reached the + // ApplyResourceChange function, so we can guarantee the expected + // order of operations so our hook events below will always match. + t.Log("waiting for the apply phase to get started") + <-waitCh + + // This will block until the apply operation has unwound, so we should + // be able to observe all of the apply side-effects afterwards. + t.Log("waiting for ctx.Stop to return") + ctx.Stop() + + t.Log("waiting for apply goroutine to return state") + state := <-stateCh + + t.Log("apply is all complete") + if state == nil { + t.Fatalf("final state is nil") + } + + // Because we interrupted the apply phase while applying the resource, + // we should have halted immediately after we finished visiting that + // resource. We don't visit indefinite.bar at all. + gotEvents := hook.Calls + wantEvents := []*testHookCall{ + {"PreDiff", "indefinite.foo"}, + {"PostDiff", "indefinite.foo"}, + {"PreApply", "indefinite.foo"}, + {"Stopping", ""}, // Local backend uses this as a hint to persist the latest state snapshot + {"PostApply", "indefinite.foo"}, + {"PostStateUpdate", ""}, // State gets updated one more time to include the apply result. + } + if diff := cmp.Diff(wantEvents, gotEvents); diff != "" { + t.Errorf("wrong hook events\n%s", diff) + } + + rov := state.OutputValue(addrs.OutputValue{Name: "result"}.Absolute(addrs.RootModuleInstance)) + if rov != nil && rov.Value != cty.NilVal && !rov.Value.IsNull() { + t.Errorf("'result' output value unexpectedly populated: %#v", rov.Value) + } + + resourceAddr := addrs.Resource{ + Mode: addrs.ManagedResourceMode, + Type: "indefinite", + Name: "foo", + }.Instance(addrs.NoKey).Absolute(addrs.RootModuleInstance) + rv := state.ResourceInstance(resourceAddr) + if rv == nil || rv.Current == nil { + t.Fatalf("no state entry for %s", resourceAddr) + } + + resourceAddr.Resource.Resource.Name = "bar" + rv = state.ResourceInstance(resourceAddr) + if rv != nil && rv.Current != nil { + t.Fatalf("unexpected state entry for %s", resourceAddr) + } +} + func TestContext2Apply_unstable(t *testing.T) { // This tests behavior when the configuration contains an unstable value, // such as the result of uuid() or timestamp(), where each call produces diff --git a/internal/terraform/hook.go b/internal/terraform/hook.go index 7e927e8a5d..7d91a6fd1b 100644 --- a/internal/terraform/hook.go +++ b/internal/terraform/hook.go @@ -76,6 +76,22 @@ type Hook interface { PreImportState(addr addrs.AbsResourceInstance, importID string) (HookAction, error) PostImportState(addr addrs.AbsResourceInstance, imported []providers.ImportedResource) (HookAction, error) + // Stopping is called if an external signal requests that Terraform + // gracefully abort an operation in progress. + // + // This notification might suggest that the user wants Terraform to exit + // ASAP and in that case it's possible that if Terraform runs for too much + // longer then it'll get killed un-gracefully, and so this hook could be + // an opportunity to persist any transient data that would be lost under + // a subsequent kill signal. However, implementations must take care to do + // so in a way that won't cause corruption if the process _is_ killed while + // this hook is still running. + // + // This hook cannot control whether Terraform continues, because the + // graceful shutdown process is typically already running by the time this + // function is called. + Stopping() + // PostStateUpdate is called each time the state is updated. It receives // a deep copy of the state, which it may therefore access freely without // any need for locks to protect from concurrent writes from the caller. @@ -140,6 +156,10 @@ func (*NilHook) PostImportState(addr addrs.AbsResourceInstance, imported []provi return HookActionContinue, nil } +func (*NilHook) Stopping() { + // Does nothing at all by default +} + func (*NilHook) PostStateUpdate(new *states.State) (HookAction, error) { return HookActionContinue, nil } diff --git a/internal/terraform/hook_mock.go b/internal/terraform/hook_mock.go index 0511a57805..8808cd3421 100644 --- a/internal/terraform/hook_mock.go +++ b/internal/terraform/hook_mock.go @@ -108,6 +108,8 @@ type MockHook struct { PostImportStateReturn HookAction PostImportStateError error + StoppingCalled bool + PostStateUpdateCalled bool PostStateUpdateState *states.State PostStateUpdateReturn HookAction @@ -264,6 +266,13 @@ func (h *MockHook) PostImportState(addr addrs.AbsResourceInstance, imported []pr return h.PostImportStateReturn, h.PostImportStateError } +func (h *MockHook) Stopping() { + h.Lock() + defer h.Unlock() + + h.StoppingCalled = true +} + func (h *MockHook) PostStateUpdate(new *states.State) (HookAction, error) { h.Lock() defer h.Unlock() diff --git a/internal/terraform/hook_stop.go b/internal/terraform/hook_stop.go index 2d4144e560..7c2488cc5c 100644 --- a/internal/terraform/hook_stop.go +++ b/internal/terraform/hook_stop.go @@ -71,6 +71,8 @@ func (h *stopHook) PostImportState(addr addrs.AbsResourceInstance, imported []pr return h.hook() } +func (h *stopHook) Stopping() {} + func (h *stopHook) PostStateUpdate(new *states.State) (HookAction, error) { return h.hook() } diff --git a/internal/terraform/hook_test.go b/internal/terraform/hook_test.go index 0d5267dafa..ba8ecaf882 100644 --- a/internal/terraform/hook_test.go +++ b/internal/terraform/hook_test.go @@ -124,6 +124,12 @@ func (h *testHook) PostImportState(addr addrs.AbsResourceInstance, imported []pr return HookActionContinue, nil } +func (h *testHook) Stopping() { + h.mu.Lock() + defer h.mu.Unlock() + h.Calls = append(h.Calls, &testHookCall{"Stopping", ""}) +} + func (h *testHook) PostStateUpdate(new *states.State) (HookAction, error) { h.mu.Lock() defer h.mu.Unlock() diff --git a/internal/terraform/testdata/apply-stop/apply-stop.tf b/internal/terraform/testdata/apply-stop/apply-stop.tf new file mode 100644 index 0000000000..003ca67b3b --- /dev/null +++ b/internal/terraform/testdata/apply-stop/apply-stop.tf @@ -0,0 +1,23 @@ +terraform { + required_providers { + indefinite = { + source = "terraform.io/test/indefinite" + } + } +} + +# The TestContext2Apply_stop test arranges for "indefinite"'s +# ApplyResourceChange to just block indefinitely until the operation +# is cancelled using Context.Stop. +resource "indefinite" "foo" { +} + +resource "indefinite" "bar" { + # Should never get here during apply because we're going to interrupt the + # run during indefinite.foo's ApplyResourceChange. + depends_on = [indefinite.foo] +} + +output "result" { + value = indefinite.foo.result +}