backend/local: Periodically persist intermediate state snapshots

Terraform Core emits a hook event every time it writes a change into the
in-memory state. Previously the local backend would just copy that into
the transient storage of the state manager, but for most state storage
implementations that doesn't really do anything useful because it just
makes another copy of the state in memory.

We originally added this hook mechanism with the intent of making
Terraform _persist_ the state each time, but we backed that out after
finding that it was a bit too aggressive and was making the state snapshot
history much harder to use in storage systems that can preserve historical
snapshots.

However, sometimes Terraform gets killed mid-apply for whatever reason and
in our previous implementation that meant always losing that transient
state, forcing the user to edit the state manually (or use "import") to
recover a useful state.

In an attempt at finding a sweet spot between these extremes, here we
change the rule so that if an apply runs for longer than 20 seconds then
we'll try to persist the state to the backend in an update that arrives
at least 20 seconds after the first update, and then again for each
additional 20 second period as long as Terraform keeps announcing new
state snapshots.

This also introduces a special interruption mode where if the apply phase
gets interrupted by SIGINT (or equivalent) then the local backend will
try to persist the state immediately in anticipation of a
possibly-imminent SIGKILL, and will then immediately persist any
subsequent state update that arrives until the apply phase is complete.
After interruption Terraform will not start any new operations and will
instead just let any already-running operations run to completion, and so
this will persist the state once per resource instance that is able to
complete before being killed.

This does mean that now long-running applies will generate intermediate
state snapshots where they wouldn't before, but there should still be
considerably fewer snapshots than were created when we were persisting
for each individual state change. We can adjust the 20 second interval
in future commits if we find that this spot isn't as sweet as first
assumed.
This commit is contained in:
Martin Atkins 2023-02-13 17:38:24 -08:00
parent 47fd019dbb
commit f0de9b60c1
10 changed files with 401 additions and 0 deletions

View File

@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"log"
"time"
"github.com/hashicorp/terraform/internal/backend"
"github.com/hashicorp/terraform/internal/command/views"
@ -74,6 +75,10 @@ func (b *Local) opApply(
op.ReportResult(runningOp, diags)
return
}
// stateHook uses schemas for when it periodically persists state to the
// persistent storage backend.
stateHook.Schemas = schemas
stateHook.PersistInterval = 20 * time.Second // arbitrary interval that's hopefully a sweet spot
var plan *plans.Plan
// If we weren't given a plan, then we refresh/plan

View File

@ -1,7 +1,9 @@
package local
import (
"log"
"sync"
"time"
"github.com/hashicorp/terraform/internal/states"
"github.com/hashicorp/terraform/internal/states/statemgr"
@ -15,6 +17,21 @@ type StateHook struct {
sync.Mutex
StateMgr statemgr.Writer
// If PersistInterval is nonzero then for any new state update after
// the duration has elapsed we'll try to persist a state snapshot
// to the persistent backend too.
// That's only possible if field Schemas is valid, because the
// StateMgr.PersistState function for some backends needs schemas.
PersistInterval time.Duration
// Schemas are the schemas to use when persisting state due to
// PersistInterval. This is ignored if PersistInterval is zero,
// and PersistInterval is ignored if this is nil.
Schemas *terraform.Schemas
lastPersist time.Time
forcePersist bool
}
var _ terraform.Hook = (*StateHook)(nil)
@ -23,11 +40,56 @@ func (h *StateHook) PostStateUpdate(new *states.State) (terraform.HookAction, er
h.Lock()
defer h.Unlock()
if h.lastPersist.IsZero() {
// The first PostStateUpdate starts the clock for intermediate
// calls to PersistState.
h.lastPersist = time.Now()
}
if h.StateMgr != nil {
if err := h.StateMgr.WriteState(new); err != nil {
return terraform.HookActionHalt, err
}
if mgrPersist, ok := h.StateMgr.(statemgr.Persister); ok && h.PersistInterval != 0 && h.Schemas != nil {
if h.forcePersist || time.Since(h.lastPersist) >= h.PersistInterval {
err := mgrPersist.PersistState(h.Schemas)
if err != nil {
return terraform.HookActionHalt, err
}
h.lastPersist = time.Now()
}
}
}
return terraform.HookActionContinue, nil
}
func (h *StateHook) Stopping() {
h.Lock()
defer h.Unlock()
// If Terraform has been asked to stop then that might mean that a hard
// kill signal will follow shortly in case Terraform doesn't stop
// quickly enough, and so we'll try to persist the latest state
// snapshot in the hope that it'll give the user less recovery work to
// do if they _do_ subsequently hard-kill Terraform during an apply.
if mgrPersist, ok := h.StateMgr.(statemgr.Persister); ok && h.Schemas != nil {
err := mgrPersist.PersistState(h.Schemas)
if err != nil {
// This hook can't affect Terraform Core's ongoing behavior,
// but it's a best effort thing anyway so we'll just emit a
// log to aid with debugging.
log.Printf("[ERROR] Failed to persist state after interruption: %s", err)
}
// While we're in the stopping phase we'll try to persist every
// new state update to maximize every opportunity we get to avoid
// losing track of objects that have been created or updated.
// Terraform Core won't start any new operations after it's been
// stopped, so at most we should see one more PostStateUpdate
// call per already-active request.
h.forcePersist = true
}
}

View File

@ -1,8 +1,12 @@
package local
import (
"fmt"
"testing"
"time"
"github.com/google/go-cmp/cmp"
"github.com/hashicorp/terraform/internal/states"
"github.com/hashicorp/terraform/internal/states/statemgr"
"github.com/hashicorp/terraform/internal/terraform"
)
@ -27,3 +31,125 @@ func TestStateHook(t *testing.T) {
t.Fatalf("bad state: %#v", is.State())
}
}
func TestStateHookStopping(t *testing.T) {
is := &testPersistentState{}
hook := &StateHook{
StateMgr: is,
Schemas: &terraform.Schemas{},
PersistInterval: 4 * time.Hour,
lastPersist: time.Now(),
}
s := statemgr.TestFullInitialState()
action, err := hook.PostStateUpdate(s)
if err != nil {
t.Fatalf("unexpected error from PostStateUpdate: %s", err)
}
if got, want := action, terraform.HookActionContinue; got != want {
t.Fatalf("wrong hookaction %#v; want %#v", got, want)
}
if is.Written == nil || !is.Written.Equal(s) {
t.Fatalf("mismatching state written")
}
if is.Persisted != nil {
t.Fatalf("persisted too soon")
}
// We'll now force lastPersist to be long enough ago that persisting
// should be due on the next call.
hook.lastPersist = time.Now().Add(-5 * time.Hour)
hook.PostStateUpdate(s)
if is.Written == nil || !is.Written.Equal(s) {
t.Fatalf("mismatching state written")
}
if is.Persisted == nil || !is.Persisted.Equal(s) {
t.Fatalf("mismatching state persisted")
}
hook.PostStateUpdate(s)
if is.Written == nil || !is.Written.Equal(s) {
t.Fatalf("mismatching state written")
}
if is.Persisted == nil || !is.Persisted.Equal(s) {
t.Fatalf("mismatching state persisted")
}
gotLog := is.CallLog
wantLog := []string{
// Initial call before we reset lastPersist
"WriteState",
// Write and then persist after we reset lastPersist
"WriteState",
"PersistState",
// Final call when persisting wasn't due yet.
"WriteState",
}
if diff := cmp.Diff(wantLog, gotLog); diff != "" {
t.Fatalf("wrong call log so far\n%s", diff)
}
// We'll reset the log now before we try seeing what happens after
// we use "Stopped".
is.CallLog = is.CallLog[:0]
is.Persisted = nil
hook.Stopping()
if is.Persisted == nil || !is.Persisted.Equal(s) {
t.Fatalf("mismatching state persisted")
}
is.Persisted = nil
hook.PostStateUpdate(s)
if is.Persisted == nil || !is.Persisted.Equal(s) {
t.Fatalf("mismatching state persisted")
}
is.Persisted = nil
hook.PostStateUpdate(s)
if is.Persisted == nil || !is.Persisted.Equal(s) {
t.Fatalf("mismatching state persisted")
}
gotLog = is.CallLog
wantLog = []string{
// "Stopping" immediately persisted
"PersistState",
// PostStateUpdate then writes and persists on every call,
// on the assumption that we're now bailing out after
// being cancelled and trying to save as much state as we can.
"WriteState",
"PersistState",
"WriteState",
"PersistState",
}
if diff := cmp.Diff(wantLog, gotLog); diff != "" {
t.Fatalf("wrong call log once in stopping mode\n%s", diff)
}
}
type testPersistentState struct {
CallLog []string
Written *states.State
Persisted *states.State
}
var _ statemgr.Writer = (*testPersistentState)(nil)
var _ statemgr.Persister = (*testPersistentState)(nil)
func (sm *testPersistentState) WriteState(state *states.State) error {
sm.CallLog = append(sm.CallLog, "WriteState")
sm.Written = state
return nil
}
func (sm *testPersistentState) PersistState(schemas *terraform.Schemas) error {
if schemas == nil {
return fmt.Errorf("no schemas")
}
sm.CallLog = append(sm.CallLog, "PersistState")
sm.Persisted = sm.Written
return nil
}

View File

@ -193,6 +193,12 @@ func (c *Context) Stop() {
c.runContextCancel = nil
}
// Notify all of the hooks that we're stopping, in case they want to try
// to flush in-memory state to disk before a subsequent hard kill.
for _, hook := range c.hooks {
hook.Stopping()
}
// Grab the condition var before we exit
if cond := c.runCond; cond != nil {
log.Printf("[INFO] terraform: waiting for graceful stop to complete")

View File

@ -64,6 +64,148 @@ func TestContext2Apply_basic(t *testing.T) {
}
}
func TestContext2Apply_stop(t *testing.T) {
t.Parallel()
m := testModule(t, "apply-stop")
stopCh := make(chan struct{})
waitCh := make(chan struct{})
stoppedCh := make(chan struct{})
p := &MockProvider{
GetProviderSchemaResponse: &providers.GetProviderSchemaResponse{
ResourceTypes: map[string]providers.Schema{
"indefinite": {
Version: 1,
Block: &configschema.Block{
Attributes: map[string]*configschema.Attribute{
"result": {
Type: cty.String,
Computed: true,
},
},
},
},
},
},
PlanResourceChangeFn: func(prcr providers.PlanResourceChangeRequest) providers.PlanResourceChangeResponse {
log.Printf("[TRACE] TestContext2Apply_stop: no-op PlanResourceChange")
return providers.PlanResourceChangeResponse{
PlannedState: cty.ObjectVal(map[string]cty.Value{
"result": cty.UnknownVal(cty.String),
}),
}
},
ApplyResourceChangeFn: func(arcr providers.ApplyResourceChangeRequest) providers.ApplyResourceChangeResponse {
// This will unblock the main test code once we reach this
// point, so that it'll then be guaranteed to call Stop
// while we're waiting in here.
close(waitCh)
log.Printf("[TRACE] TestContext2Apply_stop: ApplyResourceChange waiting for Stop call")
// This will block until StopFn closes this channel below.
<-stopCh
// This unblocks StopFn below, thereby acknowledging the request
// to stop.
close(stoppedCh)
return providers.ApplyResourceChangeResponse{
NewState: cty.ObjectVal(map[string]cty.Value{
"result": cty.StringVal("complete"),
}),
}
},
StopFn: func() error {
// Closing this channel will unblock the channel read in
// ApplyResourceChangeFn above.
log.Printf("[TRACE] TestContext2Apply_stop: Stop called")
close(stopCh)
// This will block until ApplyResourceChange has reacted to
// being stopped.
log.Printf("[TRACE] TestContext2Apply_stop: Waiting for ApplyResourceChange to react to being stopped")
<-stoppedCh
log.Printf("[TRACE] TestContext2Apply_stop: Stop is completing")
return nil
},
}
hook := &testHook{}
ctx := testContext2(t, &ContextOpts{
Hooks: []Hook{hook},
Providers: map[addrs.Provider]providers.Factory{
addrs.MustParseProviderSourceString("terraform.io/test/indefinite"): testProviderFuncFixed(p),
},
})
plan, diags := ctx.Plan(m, states.NewState(), DefaultPlanOpts)
assertNoErrors(t, diags)
// We'll reset the hook events before we apply because we only care about
// the apply-time events.
hook.Calls = hook.Calls[:0]
// We'll apply in the background so that we can call Stop in the foreground.
stateCh := make(chan *states.State)
go func(plan *plans.Plan) {
state, _ := ctx.Apply(plan, m)
stateCh <- state
}(plan)
// We'll wait until the provider signals that we've reached the
// ApplyResourceChange function, so we can guarantee the expected
// order of operations so our hook events below will always match.
t.Log("waiting for the apply phase to get started")
<-waitCh
// This will block until the apply operation has unwound, so we should
// be able to observe all of the apply side-effects afterwards.
t.Log("waiting for ctx.Stop to return")
ctx.Stop()
t.Log("waiting for apply goroutine to return state")
state := <-stateCh
t.Log("apply is all complete")
if state == nil {
t.Fatalf("final state is nil")
}
// Because we interrupted the apply phase while applying the resource,
// we should have halted immediately after we finished visiting that
// resource. We don't visit indefinite.bar at all.
gotEvents := hook.Calls
wantEvents := []*testHookCall{
{"PreDiff", "indefinite.foo"},
{"PostDiff", "indefinite.foo"},
{"PreApply", "indefinite.foo"},
{"Stopping", ""}, // Local backend uses this as a hint to persist the latest state snapshot
{"PostApply", "indefinite.foo"},
{"PostStateUpdate", ""}, // State gets updated one more time to include the apply result.
}
if diff := cmp.Diff(wantEvents, gotEvents); diff != "" {
t.Errorf("wrong hook events\n%s", diff)
}
rov := state.OutputValue(addrs.OutputValue{Name: "result"}.Absolute(addrs.RootModuleInstance))
if rov != nil && rov.Value != cty.NilVal && !rov.Value.IsNull() {
t.Errorf("'result' output value unexpectedly populated: %#v", rov.Value)
}
resourceAddr := addrs.Resource{
Mode: addrs.ManagedResourceMode,
Type: "indefinite",
Name: "foo",
}.Instance(addrs.NoKey).Absolute(addrs.RootModuleInstance)
rv := state.ResourceInstance(resourceAddr)
if rv == nil || rv.Current == nil {
t.Fatalf("no state entry for %s", resourceAddr)
}
resourceAddr.Resource.Resource.Name = "bar"
rv = state.ResourceInstance(resourceAddr)
if rv != nil && rv.Current != nil {
t.Fatalf("unexpected state entry for %s", resourceAddr)
}
}
func TestContext2Apply_unstable(t *testing.T) {
// This tests behavior when the configuration contains an unstable value,
// such as the result of uuid() or timestamp(), where each call produces

View File

@ -76,6 +76,22 @@ type Hook interface {
PreImportState(addr addrs.AbsResourceInstance, importID string) (HookAction, error)
PostImportState(addr addrs.AbsResourceInstance, imported []providers.ImportedResource) (HookAction, error)
// Stopping is called if an external signal requests that Terraform
// gracefully abort an operation in progress.
//
// This notification might suggest that the user wants Terraform to exit
// ASAP and in that case it's possible that if Terraform runs for too much
// longer then it'll get killed un-gracefully, and so this hook could be
// an opportunity to persist any transient data that would be lost under
// a subsequent kill signal. However, implementations must take care to do
// so in a way that won't cause corruption if the process _is_ killed while
// this hook is still running.
//
// This hook cannot control whether Terraform continues, because the
// graceful shutdown process is typically already running by the time this
// function is called.
Stopping()
// PostStateUpdate is called each time the state is updated. It receives
// a deep copy of the state, which it may therefore access freely without
// any need for locks to protect from concurrent writes from the caller.
@ -140,6 +156,10 @@ func (*NilHook) PostImportState(addr addrs.AbsResourceInstance, imported []provi
return HookActionContinue, nil
}
func (*NilHook) Stopping() {
// Does nothing at all by default
}
func (*NilHook) PostStateUpdate(new *states.State) (HookAction, error) {
return HookActionContinue, nil
}

View File

@ -108,6 +108,8 @@ type MockHook struct {
PostImportStateReturn HookAction
PostImportStateError error
StoppingCalled bool
PostStateUpdateCalled bool
PostStateUpdateState *states.State
PostStateUpdateReturn HookAction
@ -264,6 +266,13 @@ func (h *MockHook) PostImportState(addr addrs.AbsResourceInstance, imported []pr
return h.PostImportStateReturn, h.PostImportStateError
}
func (h *MockHook) Stopping() {
h.Lock()
defer h.Unlock()
h.StoppingCalled = true
}
func (h *MockHook) PostStateUpdate(new *states.State) (HookAction, error) {
h.Lock()
defer h.Unlock()

View File

@ -71,6 +71,8 @@ func (h *stopHook) PostImportState(addr addrs.AbsResourceInstance, imported []pr
return h.hook()
}
func (h *stopHook) Stopping() {}
func (h *stopHook) PostStateUpdate(new *states.State) (HookAction, error) {
return h.hook()
}

View File

@ -124,6 +124,12 @@ func (h *testHook) PostImportState(addr addrs.AbsResourceInstance, imported []pr
return HookActionContinue, nil
}
func (h *testHook) Stopping() {
h.mu.Lock()
defer h.mu.Unlock()
h.Calls = append(h.Calls, &testHookCall{"Stopping", ""})
}
func (h *testHook) PostStateUpdate(new *states.State) (HookAction, error) {
h.mu.Lock()
defer h.mu.Unlock()

View File

@ -0,0 +1,23 @@
terraform {
required_providers {
indefinite = {
source = "terraform.io/test/indefinite"
}
}
}
# The TestContext2Apply_stop test arranges for "indefinite"'s
# ApplyResourceChange to just block indefinitely until the operation
# is cancelled using Context.Stop.
resource "indefinite" "foo" {
}
resource "indefinite" "bar" {
# Should never get here during apply because we're going to interrupt the
# run during indefinite.foo's ApplyResourceChange.
depends_on = [indefinite.foo]
}
output "result" {
value = indefinite.foo.result
}