backend/local: Periodically persist intermediate state snapshots

Terraform Core emits a hook event every time it writes a change into the in-memory state. Previously the local backend would just copy that into the transient storage of the state manager, but for most state storage implementations that doesn't really do anything useful because it just makes another copy of the state in memory. We originally added this hook mechanism with the intent of making Terraform _persist_ the state each time, but we backed that out after finding that it was a bit too aggressive and was making the state snapshot history much harder to use in storage systems that can preserve historical snapshots. However, sometimes Terraform gets killed mid-apply for whatever reason and in our previous implementation that meant always losing that transient state, forcing the user to edit the state manually (or use "import") to recover a useful state. In an attempt at finding a sweet spot between these extremes, here we change the rule so that if an apply runs for longer than 20 seconds then we'll try to persist the state to the backend in an update that arrives at least 20 seconds after the first update, and then again for each additional 20 second period as long as Terraform keeps announcing new state snapshots. This also introduces a special interruption mode where if the apply phase gets interrupted by SIGINT (or equivalent) then the local backend will try to persist the state immediately in anticipation of a possibly-imminent SIGKILL, and will then immediately persist any subsequent state update that arrives until the apply phase is complete. After interruption Terraform will not start any new operations and will instead just let any already-running operations run to completion, and so this will persist the state once per resource instance that is able to complete before being killed. This does mean that now long-running applies will generate intermediate state snapshots where they wouldn't before, but there should still be considerably fewer snapshots than were created when we were persisting for each individual state change. We can adjust the 20 second interval in future commits if we find that this spot isn't as sweet as first assumed.
2025-02-25 18:45:20 -06:00 · 2023-02-13 17:38:24 -08:00 · 2023-02-13 17:38:24 -08:00 · f0de9b60c1
commit f0de9b60c1
parent 47fd019dbb
10 changed files with 401 additions and 0 deletions
--- a/internal/backend/local/backend_apply.go
+++ b/internal/backend/local/backend_apply.go
@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"log"
+	"time"

 	"github.com/hashicorp/terraform/internal/backend"
 	"github.com/hashicorp/terraform/internal/command/views"
@ -74,6 +75,10 @@ func (b *Local) opApply(
 		op.ReportResult(runningOp, diags)
 		return
 	}
+	// stateHook uses schemas for when it periodically persists state to the
+	// persistent storage backend.
+	stateHook.Schemas = schemas
+	stateHook.PersistInterval = 20 * time.Second // arbitrary interval that's hopefully a sweet spot

 	var plan *plans.Plan
 	// If we weren't given a plan, then we refresh/plan
--- a/internal/backend/local/hook_state.go
+++ b/internal/backend/local/hook_state.go
@ -1,7 +1,9 @@
 package local

 import (
+	"log"
 	"sync"
+	"time"

 	"github.com/hashicorp/terraform/internal/states"
 	"github.com/hashicorp/terraform/internal/states/statemgr"
@ -15,6 +17,21 @@ type StateHook struct {
 	sync.Mutex

 	StateMgr statemgr.Writer
+
+	// If PersistInterval is nonzero then for any new state update after
+	// the duration has elapsed we'll try to persist a state snapshot
+	// to the persistent backend too.
+	// That's only possible if field Schemas is valid, because the
+	// StateMgr.PersistState function for some backends needs schemas.
+	PersistInterval time.Duration
+
+	// Schemas are the schemas to use when persisting state due to
+	// PersistInterval. This is ignored if PersistInterval is zero,
+	// and PersistInterval is ignored if this is nil.
+	Schemas *terraform.Schemas
+
+	lastPersist  time.Time
+	forcePersist bool
 }

 var _ terraform.Hook = (*StateHook)(nil)
@ -23,11 +40,56 @@ func (h *StateHook) PostStateUpdate(new *states.State) (terraform.HookAction, er
 	h.Lock()
 	defer h.Unlock()

+	if h.lastPersist.IsZero() {
+		// The first PostStateUpdate starts the clock for intermediate
+		// calls to PersistState.
+		h.lastPersist = time.Now()
+	}
+
 	if h.StateMgr != nil {
 		if err := h.StateMgr.WriteState(new); err != nil {
 			return terraform.HookActionHalt, err
 		}
+		if mgrPersist, ok := h.StateMgr.(statemgr.Persister); ok && h.PersistInterval != 0 && h.Schemas != nil {
+			if h.forcePersist || time.Since(h.lastPersist) >= h.PersistInterval {
+				err := mgrPersist.PersistState(h.Schemas)
+				if err != nil {
+					return terraform.HookActionHalt, err
+				}
+				h.lastPersist = time.Now()
+			}
+		}
 	}

 	return terraform.HookActionContinue, nil
 }
+
+func (h *StateHook) Stopping() {
+	h.Lock()
+	defer h.Unlock()
+
+	// If Terraform has been asked to stop then that might mean that a hard
+	// kill signal will follow shortly in case Terraform doesn't stop
+	// quickly enough, and so we'll try to persist the latest state
+	// snapshot in the hope that it'll give the user less recovery work to
+	// do if they _do_ subsequently hard-kill Terraform during an apply.
+
+	if mgrPersist, ok := h.StateMgr.(statemgr.Persister); ok && h.Schemas != nil {
+		err := mgrPersist.PersistState(h.Schemas)
+		if err != nil {
+			// This hook can't affect Terraform Core's ongoing behavior,
+			// but it's a best effort thing anyway so we'll just emit a
+			// log to aid with debugging.
+			log.Printf("[ERROR] Failed to persist state after interruption: %s", err)
+		}
+
+		// While we're in the stopping phase we'll try to persist every
+		// new state update to maximize every opportunity we get to avoid
+		// losing track of objects that have been created or updated.
+		// Terraform Core won't start any new operations after it's been
+		// stopped, so at most we should see one more PostStateUpdate
+		// call per already-active request.
+		h.forcePersist = true
+	}
+
+}
--- a/internal/backend/local/hook_state_test.go
+++ b/internal/backend/local/hook_state_test.go
@ -1,8 +1,12 @@
 package local

 import (
+	"fmt"
 	"testing"
+	"time"

+	"github.com/google/go-cmp/cmp"
+	"github.com/hashicorp/terraform/internal/states"
 	"github.com/hashicorp/terraform/internal/states/statemgr"
 	"github.com/hashicorp/terraform/internal/terraform"
 )
@ -27,3 +31,125 @@ func TestStateHook(t *testing.T) {
 		t.Fatalf("bad state: %#v", is.State())
 	}
 }
+
+func TestStateHookStopping(t *testing.T) {
+	is := &testPersistentState{}
+	hook := &StateHook{
+		StateMgr:        is,
+		Schemas:         &terraform.Schemas{},
+		PersistInterval: 4 * time.Hour,
+		lastPersist:     time.Now(),
+	}
+
+	s := statemgr.TestFullInitialState()
+	action, err := hook.PostStateUpdate(s)
+	if err != nil {
+		t.Fatalf("unexpected error from PostStateUpdate: %s", err)
+	}
+	if got, want := action, terraform.HookActionContinue; got != want {
+		t.Fatalf("wrong hookaction %#v; want %#v", got, want)
+	}
+	if is.Written == nil || !is.Written.Equal(s) {
+		t.Fatalf("mismatching state written")
+	}
+	if is.Persisted != nil {
+		t.Fatalf("persisted too soon")
+	}
+
+	// We'll now force lastPersist to be long enough ago that persisting
+	// should be due on the next call.
+	hook.lastPersist = time.Now().Add(-5 * time.Hour)
+	hook.PostStateUpdate(s)
+	if is.Written == nil || !is.Written.Equal(s) {
+		t.Fatalf("mismatching state written")
+	}
+	if is.Persisted == nil || !is.Persisted.Equal(s) {
+		t.Fatalf("mismatching state persisted")
+	}
+	hook.PostStateUpdate(s)
+	if is.Written == nil || !is.Written.Equal(s) {
+		t.Fatalf("mismatching state written")
+	}
+	if is.Persisted == nil || !is.Persisted.Equal(s) {
+		t.Fatalf("mismatching state persisted")
+	}
+
+	gotLog := is.CallLog
+	wantLog := []string{
+		// Initial call before we reset lastPersist
+		"WriteState",
+
+		// Write and then persist after we reset lastPersist
+		"WriteState",
+		"PersistState",
+
+		// Final call when persisting wasn't due yet.
+		"WriteState",
+	}
+	if diff := cmp.Diff(wantLog, gotLog); diff != "" {
+		t.Fatalf("wrong call log so far\n%s", diff)
+	}
+
+	// We'll reset the log now before we try seeing what happens after
+	// we use "Stopped".
+	is.CallLog = is.CallLog[:0]
+	is.Persisted = nil
+
+	hook.Stopping()
+	if is.Persisted == nil || !is.Persisted.Equal(s) {
+		t.Fatalf("mismatching state persisted")
+	}
+
+	is.Persisted = nil
+	hook.PostStateUpdate(s)
+	if is.Persisted == nil || !is.Persisted.Equal(s) {
+		t.Fatalf("mismatching state persisted")
+	}
+	is.Persisted = nil
+	hook.PostStateUpdate(s)
+	if is.Persisted == nil || !is.Persisted.Equal(s) {
+		t.Fatalf("mismatching state persisted")
+	}
+
+	gotLog = is.CallLog
+	wantLog = []string{
+		// "Stopping" immediately persisted
+		"PersistState",
+
+		// PostStateUpdate then writes and persists on every call,
+		// on the assumption that we're now bailing out after
+		// being cancelled and trying to save as much state as we can.
+		"WriteState",
+		"PersistState",
+		"WriteState",
+		"PersistState",
+	}
+	if diff := cmp.Diff(wantLog, gotLog); diff != "" {
+		t.Fatalf("wrong call log once in stopping mode\n%s", diff)
+	}
+}
+
+type testPersistentState struct {
+	CallLog []string
+
+	Written   *states.State
+	Persisted *states.State
+}
+
+var _ statemgr.Writer = (*testPersistentState)(nil)
+var _ statemgr.Persister = (*testPersistentState)(nil)
+
+func (sm *testPersistentState) WriteState(state *states.State) error {
+	sm.CallLog = append(sm.CallLog, "WriteState")
+	sm.Written = state
+	return nil
+}
+
+func (sm *testPersistentState) PersistState(schemas *terraform.Schemas) error {
+	if schemas == nil {
+		return fmt.Errorf("no schemas")
+	}
+	sm.CallLog = append(sm.CallLog, "PersistState")
+	sm.Persisted = sm.Written
+	return nil
+}
--- a/internal/terraform/context.go
+++ b/internal/terraform/context.go
@ -193,6 +193,12 @@ func (c *Context) Stop() {
 		c.runContextCancel = nil
 	}

+	// Notify all of the hooks that we're stopping, in case they want to try
+	// to flush in-memory state to disk before a subsequent hard kill.
+	for _, hook := range c.hooks {
+		hook.Stopping()
+	}
+
 	// Grab the condition var before we exit
 	if cond := c.runCond; cond != nil {
 		log.Printf("[INFO] terraform: waiting for graceful stop to complete")
--- a/internal/terraform/context_apply_test.go
+++ b/internal/terraform/context_apply_test.go
@ -64,6 +64,148 @@ func TestContext2Apply_basic(t *testing.T) {
 	}
 }

+func TestContext2Apply_stop(t *testing.T) {
+	t.Parallel()
+
+	m := testModule(t, "apply-stop")
+	stopCh := make(chan struct{})
+	waitCh := make(chan struct{})
+	stoppedCh := make(chan struct{})
+	p := &MockProvider{
+		GetProviderSchemaResponse: &providers.GetProviderSchemaResponse{
+			ResourceTypes: map[string]providers.Schema{
+				"indefinite": {
+					Version: 1,
+					Block: &configschema.Block{
+						Attributes: map[string]*configschema.Attribute{
+							"result": {
+								Type:     cty.String,
+								Computed: true,
+							},
+						},
+					},
+				},
+			},
+		},
+		PlanResourceChangeFn: func(prcr providers.PlanResourceChangeRequest) providers.PlanResourceChangeResponse {
+			log.Printf("[TRACE] TestContext2Apply_stop: no-op PlanResourceChange")
+			return providers.PlanResourceChangeResponse{
+				PlannedState: cty.ObjectVal(map[string]cty.Value{
+					"result": cty.UnknownVal(cty.String),
+				}),
+			}
+		},
+		ApplyResourceChangeFn: func(arcr providers.ApplyResourceChangeRequest) providers.ApplyResourceChangeResponse {
+			// This will unblock the main test code once we reach this
+			// point, so that it'll then be guaranteed to call Stop
+			// while we're waiting in here.
+			close(waitCh)
+
+			log.Printf("[TRACE] TestContext2Apply_stop: ApplyResourceChange waiting for Stop call")
+			// This will block until StopFn closes this channel below.
+			<-stopCh
+			// This unblocks StopFn below, thereby acknowledging the request
+			// to stop.
+			close(stoppedCh)
+			return providers.ApplyResourceChangeResponse{
+				NewState: cty.ObjectVal(map[string]cty.Value{
+					"result": cty.StringVal("complete"),
+				}),
+			}
+		},
+		StopFn: func() error {
+			// Closing this channel will unblock the channel read in
+			// ApplyResourceChangeFn above.
+			log.Printf("[TRACE] TestContext2Apply_stop: Stop called")
+			close(stopCh)
+			// This will block until ApplyResourceChange has reacted to
+			// being stopped.
+			log.Printf("[TRACE] TestContext2Apply_stop: Waiting for ApplyResourceChange to react to being stopped")
+			<-stoppedCh
+			log.Printf("[TRACE] TestContext2Apply_stop: Stop is completing")
+			return nil
+		},
+	}
+
+	hook := &testHook{}
+	ctx := testContext2(t, &ContextOpts{
+		Hooks: []Hook{hook},
+		Providers: map[addrs.Provider]providers.Factory{
+			addrs.MustParseProviderSourceString("terraform.io/test/indefinite"): testProviderFuncFixed(p),
+		},
+	})
+
+	plan, diags := ctx.Plan(m, states.NewState(), DefaultPlanOpts)
+	assertNoErrors(t, diags)
+
+	// We'll reset the hook events before we apply because we only care about
+	// the apply-time events.
+	hook.Calls = hook.Calls[:0]
+
+	// We'll apply in the background so that we can call Stop in the foreground.
+	stateCh := make(chan *states.State)
+	go func(plan *plans.Plan) {
+		state, _ := ctx.Apply(plan, m)
+		stateCh <- state
+	}(plan)
+
+	// We'll wait until the provider signals that we've reached the
+	// ApplyResourceChange function, so we can guarantee the expected
+	// order of operations so our hook events below will always match.
+	t.Log("waiting for the apply phase to get started")
+	<-waitCh
+
+	// This will block until the apply operation has unwound, so we should
+	// be able to observe all of the apply side-effects afterwards.
+	t.Log("waiting for ctx.Stop to return")
+	ctx.Stop()
+
+	t.Log("waiting for apply goroutine to return state")
+	state := <-stateCh
+
+	t.Log("apply is all complete")
+	if state == nil {
+		t.Fatalf("final state is nil")
+	}
+
+	// Because we interrupted the apply phase while applying the resource,
+	// we should have halted immediately after we finished visiting that
+	// resource. We don't visit indefinite.bar at all.
+	gotEvents := hook.Calls
+	wantEvents := []*testHookCall{
+		{"PreDiff", "indefinite.foo"},
+		{"PostDiff", "indefinite.foo"},
+		{"PreApply", "indefinite.foo"},
+		{"Stopping", ""}, // Local backend uses this as a hint to persist the latest state snapshot
+		{"PostApply", "indefinite.foo"},
+		{"PostStateUpdate", ""}, // State gets updated one more time to include the apply result.
+	}
+	if diff := cmp.Diff(wantEvents, gotEvents); diff != "" {
+		t.Errorf("wrong hook events\n%s", diff)
+	}
+
+	rov := state.OutputValue(addrs.OutputValue{Name: "result"}.Absolute(addrs.RootModuleInstance))
+	if rov != nil && rov.Value != cty.NilVal && !rov.Value.IsNull() {
+		t.Errorf("'result' output value unexpectedly populated: %#v", rov.Value)
+	}
+
+	resourceAddr := addrs.Resource{
+		Mode: addrs.ManagedResourceMode,
+		Type: "indefinite",
+		Name: "foo",
+	}.Instance(addrs.NoKey).Absolute(addrs.RootModuleInstance)
+	rv := state.ResourceInstance(resourceAddr)
+	if rv == nil || rv.Current == nil {
+		t.Fatalf("no state entry for %s", resourceAddr)
+	}
+
+	resourceAddr.Resource.Resource.Name = "bar"
+	rv = state.ResourceInstance(resourceAddr)
+	if rv != nil && rv.Current != nil {
+		t.Fatalf("unexpected state entry for %s", resourceAddr)
+	}
+}
+
 func TestContext2Apply_unstable(t *testing.T) {
 	// This tests behavior when the configuration contains an unstable value,
 	// such as the result of uuid() or timestamp(), where each call produces
--- a/internal/terraform/hook.go
+++ b/internal/terraform/hook.go
@ -76,6 +76,22 @@ type Hook interface {
 	PreImportState(addr addrs.AbsResourceInstance, importID string) (HookAction, error)
 	PostImportState(addr addrs.AbsResourceInstance, imported []providers.ImportedResource) (HookAction, error)

+	// Stopping is called if an external signal requests that Terraform
+	// gracefully abort an operation in progress.
+	//
+	// This notification might suggest that the user wants Terraform to exit
+	// ASAP and in that case it's possible that if Terraform runs for too much
+	// longer then it'll get killed un-gracefully, and so this hook could be
+	// an opportunity to persist any transient data that would be lost under
+	// a subsequent kill signal. However, implementations must take care to do
+	// so in a way that won't cause corruption if the process _is_ killed while
+	// this hook is still running.
+	//
+	// This hook cannot control whether Terraform continues, because the
+	// graceful shutdown process is typically already running by the time this
+	// function is called.
+	Stopping()
+
 	// PostStateUpdate is called each time the state is updated. It receives
 	// a deep copy of the state, which it may therefore access freely without
 	// any need for locks to protect from concurrent writes from the caller.
@ -140,6 +156,10 @@ func (*NilHook) PostImportState(addr addrs.AbsResourceInstance, imported []provi
 	return HookActionContinue, nil
 }

+func (*NilHook) Stopping() {
+	// Does nothing at all by default
+}
+
 func (*NilHook) PostStateUpdate(new *states.State) (HookAction, error) {
 	return HookActionContinue, nil
 }
--- a/internal/terraform/hook_mock.go
+++ b/internal/terraform/hook_mock.go
@ -108,6 +108,8 @@ type MockHook struct {
 	PostImportStateReturn    HookAction
 	PostImportStateError     error

+	StoppingCalled bool
+
 	PostStateUpdateCalled bool
 	PostStateUpdateState  *states.State
 	PostStateUpdateReturn HookAction
@ -264,6 +266,13 @@ func (h *MockHook) PostImportState(addr addrs.AbsResourceInstance, imported []pr
 	return h.PostImportStateReturn, h.PostImportStateError
 }

+func (h *MockHook) Stopping() {
+	h.Lock()
+	defer h.Unlock()
+
+	h.StoppingCalled = true
+}
+
 func (h *MockHook) PostStateUpdate(new *states.State) (HookAction, error) {
 	h.Lock()
 	defer h.Unlock()
--- a/internal/terraform/hook_stop.go
+++ b/internal/terraform/hook_stop.go
@ -71,6 +71,8 @@ func (h *stopHook) PostImportState(addr addrs.AbsResourceInstance, imported []pr
 	return h.hook()
 }

+func (h *stopHook) Stopping() {}
+
 func (h *stopHook) PostStateUpdate(new *states.State) (HookAction, error) {
 	return h.hook()
 }
--- a/internal/terraform/hook_test.go
+++ b/internal/terraform/hook_test.go
@ -124,6 +124,12 @@ func (h *testHook) PostImportState(addr addrs.AbsResourceInstance, imported []pr
 	return HookActionContinue, nil
 }

+func (h *testHook) Stopping() {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+	h.Calls = append(h.Calls, &testHookCall{"Stopping", ""})
+}
+
 func (h *testHook) PostStateUpdate(new *states.State) (HookAction, error) {
 	h.mu.Lock()
 	defer h.mu.Unlock()
--- a/internal/terraform/testdata/apply-stop/apply-stop.tf
+++ b/internal/terraform/testdata/apply-stop/apply-stop.tf
@ -0,0 +1,23 @@
+terraform {
+  required_providers {
+    indefinite = {
+      source = "terraform.io/test/indefinite"
+    }
+  }
+}
+
+# The TestContext2Apply_stop test arranges for "indefinite"'s
+# ApplyResourceChange to just block indefinitely until the operation
+# is cancelled using Context.Stop.
+resource "indefinite" "foo" {
+}
+
+resource "indefinite" "bar" {
+  # Should never get here during apply because we're going to interrupt the
+  # run during indefinite.foo's ApplyResourceChange.
+  depends_on = [indefinite.foo]
+}
+
+output "result" {
+  value = indefinite.foo.result
+}