Improve Swarm support (#333)

* Query for labeled services as well * Try scaling down services * Scale services back up * Use progress tool from Docker CLI * In test, label both services * Clean up error and log messages * Document scale-up/down approach in docs * Downgrade Docker CLI to match client * Document services stats * Do not rely on PreviousSpec for storing desired replica count * Log warnings from Docker when updating services * Check whether container and service labels collide * Document script behavior on label collision * Add additional check if all containers have been removed * Scale services concurrently * Move docker interaction code into own file * Factor out code for service updating * Time out after five minutes of not reaching desired container count * Inline handling of in-swarm container level restart * Timer is more suitable for timeout race * Timeout when scaling down services should be configurable * Choose better filename * Reflect changes in naming * Rename and deprecate BACKUP_STOP_CONTAINER_LABEL * Improve logging * Further simplify logging
2024-11-22 05:10:28 +01:00 · 2024-01-31 12:17:41 +01:00 · 2024-01-31 12:17:41 +01:00 · c3daeacecb
commit c3daeacecb
parent 2065fb2815
18 changed files with 640 additions and 145 deletions
--- a/cmd/backup/config.go
+++ b/cmd/backup/config.go
@ -37,7 +37,9 @@ type Config struct {
 	BackupRetentionDays           int32           `split_words:"true" default:"-1"`
 	BackupPruningLeeway           time.Duration   `split_words:"true" default:"1m"`
 	BackupPruningPrefix           string          `split_words:"true"`
-	BackupStopContainerLabel      string          `split_words:"true" default:"true"`
+	BackupStopContainerLabel      string          `split_words:"true"`
 	BackupStopDuringBackupLabel   string          `split_words:"true" default:"true"`
 	BackupStopServiceTimeout      time.Duration   `split_words:"true" default:"5m"`
 	BackupFromSnapshot            bool            `split_words:"true"`
 	BackupExcludeRegexp           RegexpDecoder   `split_words:"true"`
 	BackupSkipBackendsFromPrune   []string        `split_words:"true"`
--- a/cmd/backup/main.go
+++ b/cmd/backup/main.go
@ -47,12 +47,12 @@ func main() {
 	}()
 	s.must(s.withLabeledCommands(lifecyclePhaseArchive, func() error {
-		restartContainers, err := s.stopContainers()
+		restartContainersAndServices, err := s.stopContainersAndServices()
 		// The mechanism for restarting containers is not using hooks as it
 		// should happen as soon as possible (i.e. before uploading backups or
 		// similar).
 		defer func() {
-			s.must(restartContainers())
+			s.must(restartContainersAndServices())
 		}()
 		if err != nil {
 			return err
--- a/cmd/backup/script.go
+++ b/cmd/backup/script.go
@ -5,8 +5,6 @@ package main
 import (
 	"bytes"
 	"context"
 	"errors"
 	"fmt"
 	"io"
 	"io/fs"
@ -30,10 +28,6 @@ import (
 	openpgp "github.com/ProtonMail/go-crypto/openpgp/v2"
 	"github.com/containrrr/shoutrrr"
 	"github.com/containrrr/shoutrrr/pkg/router"
 	"github.com/docker/docker/api/types"
 	ctr "github.com/docker/docker/api/types/container"
 	"github.com/docker/docker/api/types/filters"
 	"github.com/docker/docker/api/types/swarm"
 	"github.com/docker/docker/client"
 	"github.com/leekchan/timeutil"
 	"github.com/offen/envconfig"
@ -318,126 +312,6 @@ func newScript() (*script, error) {
 	return s, nil
 }
 // stopContainers stops all Docker containers that are marked as to being
 // stopped during the backup and returns a function that can be called to
 // restart everything that has been stopped.
 //
 // Containers are selected by the label
 // `docker-volume-backup.stop-during-backup=<BackupStopContainerLabel>`. When
 // no Docker client is configured or no container matches, the returned
 // function is a no-op. The returned error aggregates all failures that
 // occurred while stopping containers; the restart function is returned
 // alongside it so containers that did get stopped can still be restarted.
 func (s *script) stopContainers() (func() error, error) {
 	if s.cli == nil {
 		return noop, nil
 	}
 	// The unfiltered listing is only used for reporting totals in logs and stats.
 	allContainers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{})
 	if err != nil {
 		return noop, fmt.Errorf("stopContainers: error querying for containers: %w", err)
 	}
 	containerLabel := fmt.Sprintf(
 		"docker-volume-backup.stop-during-backup=%s",
 		s.c.BackupStopContainerLabel,
 	)
 	containersToStop, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{
 		Filters: filters.NewArgs(filters.KeyValuePair{
 			Key:   "label",
 			Value: containerLabel,
 		}),
 	})
 	if err != nil {
 		return noop, fmt.Errorf("stopContainers: error querying for containers to stop: %w", err)
 	}
 	if len(containersToStop) == 0 {
 		return noop, nil
 	}
 	s.logger.Info(
 		fmt.Sprintf(
 			"Stopping %d container(s) labeled `%s` out of %d running container(s).",
 			len(containersToStop),
 			containerLabel,
 			len(allContainers),
 		),
 	)
 	// Stopping is best effort: failures are collected and the loop continues
 	// with the remaining containers.
 	var stoppedContainers []types.Container
 	var stopErrors []error
 	for _, container := range containersToStop {
 		if err := s.cli.ContainerStop(context.Background(), container.ID, ctr.StopOptions{}); err != nil {
 			stopErrors = append(stopErrors, err)
 		} else {
 			stoppedContainers = append(stoppedContainers, container)
 		}
 	}
 	var stopError error
 	if len(stopErrors) != 0 {
 		stopError = fmt.Errorf(
 			"stopContainers: %d error(s) stopping containers: %w",
 			len(stopErrors),
 			errors.Join(stopErrors...),
 		)
 	}
 	s.stats.Containers = ContainersStats{
 		All:     uint(len(allContainers)),
 		ToStop:  uint(len(containersToStop)),
 		Stopped: uint(len(stoppedContainers)),
 	}
 	return func() error {
 		// Containers that belong to a Swarm service are not started directly.
 		// Instead, the names of their services are collected and each service
 		// is force updated once below.
 		servicesRequiringUpdate := map[string]struct{}{}
 		var restartErrors []error
 		for _, container := range stoppedContainers {
 			if swarmServiceName, ok := container.Labels["com.docker.swarm.service.name"]; ok {
 				servicesRequiringUpdate[swarmServiceName] = struct{}{}
 				continue
 			}
 			if err := s.cli.ContainerStart(context.Background(), container.ID, types.ContainerStartOptions{}); err != nil {
 				restartErrors = append(restartErrors, err)
 			}
 		}
 		if len(servicesRequiringUpdate) != 0 {
 			// NOTE(review): the error returned by ServiceList is discarded here. If
 			// listing fails, every lookup below ends in the "couldn't find service"
 			// error instead of surfacing the actual cause.
 			services, _ := s.cli.ServiceList(context.Background(), types.ServiceListOptions{})
 			for serviceName := range servicesRequiringUpdate {
 				var serviceMatch swarm.Service
 				for _, service := range services {
 					if service.Spec.Name == serviceName {
 						serviceMatch = service
 						break
 					}
 				}
 				if serviceMatch.ID == "" {
 					// NOTE(review): this returns right away, which skips the remaining
 					// services and drops any errors collected in restartErrors so far.
 					return fmt.Errorf("stopContainers: couldn't find service with name %s", serviceName)
 				}
 				// Bumping ForceUpdate makes the update take effect even though the
 				// rest of the spec is unchanged.
 				serviceMatch.Spec.TaskTemplate.ForceUpdate += 1
 				if _, err := s.cli.ServiceUpdate(
 					context.Background(), serviceMatch.ID,
 					serviceMatch.Version, serviceMatch.Spec, types.ServiceUpdateOptions{},
 				); err != nil {
 					restartErrors = append(restartErrors, err)
 				}
 			}
 		}
 		if len(restartErrors) != 0 {
 			return fmt.Errorf(
 				"stopContainers: %d error(s) restarting containers and services: %w",
 				len(restartErrors),
 				errors.Join(restartErrors...),
 			)
 		}
 		s.logger.Info(
 			fmt.Sprintf(
 				"Restarted %d container(s) and the matching service(s).",
 				len(stoppedContainers),
 			),
 		)
 		return nil
 	}, stopError
 }
 // createArchive creates a tar archive of the configured backup location and
 // saves it to disk.
 func (s *script) createArchive() error {
@ -448,7 +322,7 @@ func (s *script) createArchive() error {
 			"Using BACKUP_FROM_SNAPSHOT has been deprecated and will be removed in the next major version.",
 		)
 		s.logger.Warn(
-			"Please use `archive-pre` and `archive-post` commands to prepare your backup sources. Refer to the README for an upgrade guide.",
+			"Please use `archive-pre` and `archive-post` commands to prepare your backup sources. Refer to the documentation for an upgrade guide.",
 		)
 		backupSources = filepath.Join("/tmp", s.c.BackupSources)
 		// copy before compressing guard against a situation where backup folder's content are still growing.
--- a/cmd/backup/stats.go
+++ b/cmd/backup/stats.go
@ -17,6 +17,15 @@ type ContainersStats struct {
 	StopErrors uint
 }
 // ServicesStats contains info about Swarm services that have been
 // operated upon
 type ServicesStats struct {
 	// All is the total number of services known to the Docker daemon.
 	All             uint
 	// ToScaleDown is the number of services that matched the stop label.
 	ToScaleDown     uint
 	// ScaledDown is the number of services that were scaled down.
 	ScaledDown      uint
 	// ScaleDownErrors is the number of errors collected while scaling down.
 	ScaleDownErrors uint
 }
 // BackupFileStats stats about the created backup file
 type BackupFileStats struct {
 	Name     string
@ -40,6 +49,7 @@ type Stats struct {
 	LockedTime time.Duration
 	LogOutput  *bytes.Buffer
 	Containers ContainersStats
 	Services   ServicesStats
 	BackupFile BackupFileStats
 	Storages   map[string]StorageStats
 }
--- a/cmd/backup/stop_restart.go
+++ b/cmd/backup/stop_restart.go
@ -0,0 +1,338 @@
 package main
 import (
 	"context"
 	"errors"
 	"fmt"
 	"io"
 	"os"
 	"sync"
 	"time"
 	"github.com/docker/cli/cli/command/service/progress"
 	"github.com/docker/docker/api/types"
 	ctr "github.com/docker/docker/api/types/container"
 	"github.com/docker/docker/api/types/filters"
 	"github.com/docker/docker/api/types/swarm"
 	"github.com/docker/docker/client"
 )
 // scaleService sets the replica count of the Swarm service identified by
 // serviceID to the given number of replicas and blocks until
 // progress.ServiceProgress returns for that service.
 //
 // Only services deployed in replicated mode can be scaled; any other mode
 // results in an error. On success, the warnings returned by the Docker API
 // for the service update are passed on to the caller. All errors are wrapped
 // using the `scaleService:` prefix so callers can tell where they originated.
 func scaleService(cli *client.Client, serviceID string, replicas uint64) ([]string, error) {
 	ctx := context.Background()
 	service, _, err := cli.ServiceInspectWithRaw(ctx, serviceID, types.ServiceInspectOptions{})
 	if err != nil {
 		return nil, fmt.Errorf("scaleService: error inspecting service %s: %w", serviceID, err)
 	}
 	// Only replicated services carry a replica count that can be adjusted.
 	if service.Spec.Mode.Replicated == nil {
 		return nil, fmt.Errorf("scaleService: service to be scaled %s has to be in replicated mode", service.Spec.Name)
 	}
 	service.Spec.Mode.Replicated.Replicas = &replicas
 	// Passing the version that was just inspected lets Docker reject the update
 	// in case the service has been changed in the meantime.
 	response, err := cli.ServiceUpdate(ctx, service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{})
 	if err != nil {
 		return nil, fmt.Errorf("scaleService: error updating service: %w", err)
 	}
 	// The progress output itself is of no interest, so it is discarded.
 	discardWriter := &noopWriteCloser{io.Discard}
 	if err := progress.ServiceProgress(ctx, cli, service.ID, discardWriter); err != nil {
 		return nil, fmt.Errorf("scaleService: error waiting for service %s to converge: %w", service.Spec.Name, err)
 	}
 	return response.Warnings, nil
 }
 // awaitContainerCountForService checks once per second how many containers
 // carry the `com.docker.swarm.service.id` label of the given service and
 // returns nil as soon as that number equals count. It gives up with an error
 // if listing containers fails or if the desired count has not been reached
 // once timeoutAfter has elapsed.
 func awaitContainerCountForService(cli *client.Client, serviceID string, count int, timeoutAfter time.Duration) error {
 	// The filter does not change between polls, so it is built only once.
 	listOptions := types.ContainerListOptions{
 		Filters: filters.NewArgs(filters.KeyValuePair{
 			Key:   "label",
 			Value: fmt.Sprintf("com.docker.swarm.service.id=%s", serviceID),
 		}),
 	}
 	ticker := time.NewTicker(time.Second)
 	defer ticker.Stop()
 	deadline := time.NewTimer(timeoutAfter)
 	defer deadline.Stop()
 	for {
 		select {
 		case <-ticker.C:
 			found, listErr := cli.ContainerList(context.Background(), listOptions)
 			if listErr != nil {
 				return fmt.Errorf("awaitContainerCount: error listing containers: %w", listErr)
 			}
 			if len(found) == count {
 				return nil
 			}
 		case <-deadline.C:
 			return fmt.Errorf(
 				"awaitContainerCount: timed out after waiting %s for service %s to reach desired container count of %d",
 				timeoutAfter,
 				serviceID,
 				count,
 			)
 		}
 	}
 }
 // stopContainersAndServices stops all running containers and - when Docker is
 // running in Swarm mode - scales down all services that are labeled
 // `docker-volume-backup.stop-during-backup=<value>`. The value is taken from
 // BACKUP_STOP_DURING_BACKUP_LABEL or from the deprecated
 // BACKUP_STOP_CONTAINER_LABEL; setting both is an error.
 //
 // It returns a function that restarts / scales back up everything that has
 // been handled. The returned function is never nil, so it is safe to call it
 // even when an error is returned alongside it.
 //
 // A non-nil error is returned when querying Docker fails, when both label
 // settings are given, when a labeled service is not deployed in replicated
 // mode, when a labeled container belongs to a service that carries the label
 // as well, or when stopping / scaling down any of the targets fails.
 func (s *script) stopContainersAndServices() (func() error, error) {
 	if s.cli == nil {
 		return noop, nil
 	}
 	dockerInfo, err := s.cli.Info(context.Background())
 	if err != nil {
 		return noop, fmt.Errorf("(*script).stopContainersAndServices: error getting docker info: %w", err)
 	}
 	isDockerSwarm := dockerInfo.Swarm.LocalNodeState != "inactive"
 	labelValue := s.c.BackupStopDuringBackupLabel
 	if s.c.BackupStopContainerLabel != "" {
 		s.logger.Warn(
 			"Using BACKUP_STOP_CONTAINER_LABEL has been deprecated and will be removed in the next major version.",
 		)
 		s.logger.Warn(
 			"Please use BACKUP_STOP_DURING_BACKUP_LABEL instead. Refer to the docs for an upgrade guide.",
 		)
 		// The new setting has a default value, so the environment needs to be
 		// consulted in order to find out whether it has been set explicitly.
 		if _, ok := os.LookupEnv("BACKUP_STOP_DURING_BACKUP_LABEL"); ok {
 			return noop, errors.New("(*script).stopContainersAndServices: both BACKUP_STOP_DURING_BACKUP_LABEL and BACKUP_STOP_CONTAINER_LABEL have been set, cannot continue")
 		}
 		labelValue = s.c.BackupStopContainerLabel
 	}
 	filterMatchLabel := fmt.Sprintf(
 		"docker-volume-backup.stop-during-backup=%s",
 		labelValue,
 	)
 	// The unfiltered listings are only used for reporting totals in logs and stats.
 	allContainers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{})
 	if err != nil {
 		return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for containers: %w", err)
 	}
 	containersToStop, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{
 		Filters: filters.NewArgs(filters.KeyValuePair{
 			Key:   "label",
 			Value: filterMatchLabel,
 		}),
 	})
 	if err != nil {
 		return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for containers to stop: %w", err)
 	}
 	var allServices []swarm.Service
 	var servicesToScaleDown []handledSwarmService
 	if isDockerSwarm {
 		allServices, err = s.cli.ServiceList(context.Background(), types.ServiceListOptions{})
 		if err != nil {
 			return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services: %w", err)
 		}
 		matchingServices, err := s.cli.ServiceList(context.Background(), types.ServiceListOptions{
 			Filters: filters.NewArgs(filters.KeyValuePair{
 				Key:   "label",
 				Value: filterMatchLabel,
 			}),
 			Status: true,
 		})
 		// The error has to be checked before the result is used.
 		if err != nil {
 			return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services to scale down: %w", err)
 		}
 		for _, svc := range matchingServices {
 			// Services that are not replicated do not carry a replica count, so
 			// dereferencing it would panic. Fail early before anything is stopped.
 			replicated := svc.Spec.Mode.Replicated
 			if replicated == nil || replicated.Replicas == nil {
 				return noop, fmt.Errorf(
 					"(*script).stopContainersAndServices: service %s is labeled to be scaled down but is not in replicated mode, cannot continue",
 					svc.Spec.Name,
 				)
 			}
 			servicesToScaleDown = append(servicesToScaleDown, handledSwarmService{
 				serviceID:           svc.ID,
 				initialReplicaCount: *replicated.Replicas,
 			})
 		}
 	}
 	if len(containersToStop) == 0 && len(servicesToScaleDown) == 0 {
 		return noop, nil
 	}
 	if isDockerSwarm {
 		// A container that is labeled itself must not belong to a service that
 		// carries the label too, as both mechanisms would interfere.
 		for _, container := range containersToStop {
 			swarmServiceID, ok := container.Labels["com.docker.swarm.service.id"]
 			if !ok {
 				continue
 			}
 			parentService, _, err := s.cli.ServiceInspectWithRaw(context.Background(), swarmServiceID, types.ServiceInspectOptions{})
 			if err != nil {
 				return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for parent service with ID %s: %w", swarmServiceID, err)
 			}
 			// Only the presence of the label key is checked, its value is ignored.
 			if _, collides := parentService.Spec.Labels["docker-volume-backup.stop-during-backup"]; collides {
 				return noop, fmt.Errorf(
 					"(*script).stopContainersAndServices: container %s is labeled to stop but has parent service %s which is also labeled, cannot continue",
 					container.Names[0],
 					parentService.Spec.Name,
 				)
 			}
 		}
 	}
 	s.logger.Info(
 		fmt.Sprintf(
 			"Stopping %d out of %d running container(s) as they were labeled %s.",
 			len(containersToStop),
 			len(allContainers),
 			filterMatchLabel,
 		),
 	)
 	if isDockerSwarm {
 		s.logger.Info(
 			fmt.Sprintf(
 				"Scaling down %d out of %d active service(s) as they were labeled %s.",
 				len(servicesToScaleDown),
 				len(allServices),
 				filterMatchLabel,
 			),
 		)
 	}
 	// Stopping is best effort: failures are collected and the loop continues
 	// with the remaining containers.
 	var stoppedContainers []types.Container
 	var stopErrors []error
 	for _, container := range containersToStop {
 		if err := s.cli.ContainerStop(context.Background(), container.ID, ctr.StopOptions{}); err != nil {
 			stopErrors = append(stopErrors, err)
 		} else {
 			stoppedContainers = append(stoppedContainers, container)
 		}
 	}
 	// Both collections are written to from multiple goroutines, so they need
 	// to be guarded against concurrent access.
 	var scaledDownServices concurrentSlice[handledSwarmService]
 	var scaleDownErrors concurrentSlice[error]
 	if isDockerSwarm {
 		wg := sync.WaitGroup{}
 		for _, svc := range servicesToScaleDown {
 			wg.Add(1)
 			go func(svc handledSwarmService) {
 				defer wg.Done()
 				warnings, err := scaleService(s.cli, svc.serviceID, 0)
 				if err != nil {
 					// There is no point in waiting for containers to disappear if
 					// scaling down did not succeed in the first place.
 					scaleDownErrors.append(err)
 					return
 				}
 				scaledDownServices.append(svc)
 				for _, warning := range warnings {
 					s.logger.Warn(
 						fmt.Sprintf("The Docker API returned a warning when scaling down service %s: %s", svc.serviceID, warning),
 					)
 				}
 				// progress.ServiceProgress returns too early, so we need to manually check
 				// whether all containers belonging to the service have actually been removed
 				if err := awaitContainerCountForService(s.cli, svc.serviceID, 0, s.c.BackupStopServiceTimeout); err != nil {
 					scaleDownErrors.append(err)
 				}
 			}(svc)
 		}
 		wg.Wait()
 	}
 	s.stats.Containers = ContainersStats{
 		All:        uint(len(allContainers)),
 		ToStop:     uint(len(containersToStop)),
 		Stopped:    uint(len(stoppedContainers)),
 		StopErrors: uint(len(stopErrors)),
 	}
 	s.stats.Services = ServicesStats{
 		All:             uint(len(allServices)),
 		ToScaleDown:     uint(len(servicesToScaleDown)),
 		ScaledDown:      uint(len(scaledDownServices.value())),
 		ScaleDownErrors: uint(len(scaleDownErrors.value())),
 	}
 	var initialErr error
 	allErrors := append(stopErrors, scaleDownErrors.value()...)
 	if len(allErrors) != 0 {
 		initialErr = fmt.Errorf(
 			"(*script).stopContainersAndServices: %d error(s) stopping containers: %w",
 			len(allErrors),
 			errors.Join(allErrors...),
 		)
 	}
 	return func() error {
 		var restartErrors []error
 		matchedServices := map[string]bool{}
 		for _, container := range stoppedContainers {
 			if swarmServiceID, ok := container.Labels["com.docker.swarm.service.id"]; ok && isDockerSwarm {
 				// Each parent service needs to be updated only once, no matter how
 				// many of its containers have been stopped.
 				if _, ok := matchedServices[swarmServiceID]; ok {
 					continue
 				}
 				matchedServices[swarmServiceID] = true
 				// in case a container was part of a swarm service, the service requires to
 				// be force updated instead of restarting the container as it would otherwise
 				// remain in a "completed" state
 				service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), swarmServiceID, types.ServiceInspectOptions{})
 				if err != nil {
 					restartErrors = append(
 						restartErrors,
 						fmt.Errorf("(*script).stopContainersAndServices: error looking up parent service: %w", err),
 					)
 					continue
 				}
 				service.Spec.TaskTemplate.ForceUpdate += 1
 				if _, err := s.cli.ServiceUpdate(
 					context.Background(), service.ID,
 					service.Version, service.Spec, types.ServiceUpdateOptions{},
 				); err != nil {
 					restartErrors = append(restartErrors, err)
 				}
 				continue
 			}
 			if err := s.cli.ContainerStart(context.Background(), container.ID, types.ContainerStartOptions{}); err != nil {
 				restartErrors = append(restartErrors, err)
 			}
 		}
 		var scaleUpErrors concurrentSlice[error]
 		if isDockerSwarm {
 			wg := &sync.WaitGroup{}
 			// All services that were targeted are restored to their initial replica
 			// count, including those where scaling down reported an error, as the
 			// update might have been applied regardless.
 			for _, svc := range servicesToScaleDown {
 				wg.Add(1)
 				go func(svc handledSwarmService) {
 					defer wg.Done()
 					warnings, err := scaleService(s.cli, svc.serviceID, svc.initialReplicaCount)
 					if err != nil {
 						// Errors have to end up in scaleUpErrors, otherwise they would
 						// never be reported to the caller.
 						scaleUpErrors.append(err)
 						return
 					}
 					for _, warning := range warnings {
 						s.logger.Warn(
 							fmt.Sprintf("The Docker API returned a warning when scaling up service %s: %s", svc.serviceID, warning),
 						)
 					}
 				}(svc)
 			}
 			wg.Wait()
 		}
 		allErrors := append(restartErrors, scaleUpErrors.value()...)
 		if len(allErrors) != 0 {
 			return fmt.Errorf(
 				"(*script).stopContainersAndServices: %d error(s) restarting containers and services: %w",
 				len(allErrors),
 				errors.Join(allErrors...),
 			)
 		}
 		s.logger.Info(
 			fmt.Sprintf(
 				"Restarted %d container(s).",
 				len(stoppedContainers),
 			),
 		)
 		if isDockerSwarm {
 			s.logger.Info(
 				fmt.Sprintf(
 					"Scaled %d service(s) back up.",
 					len(scaledDownServices.value()),
 				),
 			)
 		}
 		return nil
 	}, initialErr
 }
--- a/cmd/backup/util.go
+++ b/cmd/backup/util.go
@ -8,6 +8,7 @@ import (
 	"fmt"
 	"io"
 	"os"
 	"sync"
 )
 var noop = func() error { return nil }
@ -50,3 +51,31 @@ func (b *bufferingWriter) Write(p []byte) (n int, err error) {
 	}
 	return b.writer.Write(p)
 }
 // noopWriteCloser turns any io.Writer into an io.WriteCloser. Writes are
 // passed through to the embedded writer, closing has no effect.
 type noopWriteCloser struct {
 	io.Writer
 }
 // Close implements io.Closer. It does nothing and never fails.
 func (noopWriteCloser) Close() error { return nil }
 // handledSwarmService describes a Swarm service that is scaled down for the
 // duration of a backup and needs to be scaled back up afterwards.
 type handledSwarmService struct {
 	// serviceID is the ID of the service as reported by Docker.
 	serviceID           string
 	// initialReplicaCount is the replica count to use when scaling back up.
 	initialReplicaCount uint64
 }
 // concurrentSlice is a slice that can safely be appended to and read from by
 // multiple goroutines. Its zero value is an empty slice ready for use. As it
 // embeds a mutex, it must not be copied after first use.
 type concurrentSlice[T any] struct {
 	val []T
 	sync.Mutex
 }
 // append adds v to the end of the slice.
 func (c *concurrentSlice[T]) append(v T) {
 	c.Lock()
 	defer c.Unlock()
 	c.val = append(c.val, v)
 }
 // value returns a snapshot of the items collected so far. The snapshot is
 // taken while holding the lock and does not share memory with the internal
 // slice, so callers may use it while other goroutines keep appending. It is
 // nil when nothing has been appended yet.
 func (c *concurrentSlice[T]) value() []T {
 	c.Lock()
 	defer c.Unlock()
 	return append([]T(nil), c.val...)
 }
--- a/docs/how-tos/replace-deprecated-backup-stop-container-label.md
+++ b/docs/how-tos/replace-deprecated-backup-stop-container-label.md
@ -0,0 +1,19 @@
 ---
 title: Replace deprecated BACKUP_STOP_CONTAINER_LABEL setting
 layout: default
 parent: How Tos
 nav_order: 19
 ---
 # Replace deprecated `BACKUP_STOP_CONTAINER_LABEL` setting
 Version `v2.36.0` deprecated the `BACKUP_STOP_CONTAINER_LABEL` setting and renamed it to `BACKUP_STOP_DURING_BACKUP_LABEL`, which is supposed to signal that this will stop both containers _and_ services.
 Migrating is done by renaming the key for your custom value:
 ```diff
    env:
 -     BACKUP_STOP_CONTAINER_LABEL: database
 +     BACKUP_STOP_DURING_BACKUP_LABEL: database
 ```
 The old key will stay supported until the next major version, but logs a warning each time a backup is taken.
--- a/docs/how-tos/set-up-notifications.md
+++ b/docs/how-tos/set-up-notifications.md
@ -76,7 +76,7 @@ Configuration, data about the backup run and helper functions will be passed to
 Here is a list of all data passed to the template:
-* `Config`: this object holds the configuration that has been passed to the script. The field names are the name of the recognized environment variables converted in PascalCase. (e.g. `BACKUP_STOP_CONTAINER_LABEL` becomes `BackupStopContainerLabel`)
+* `Config`: this object holds the configuration that has been passed to the script. The field names are the name of the recognized environment variables converted in PascalCase. (e.g. `BACKUP_STOP_DURING_BACKUP_LABEL` becomes `BackupStopDuringBackupLabel`)
 * `Error`: the error that made the backup fail. Only available in the `title_failure` and `body_failure` templates
 * `Stats`: objects that holds stats regarding script execution. In case of an unsuccessful run, some information may not be available.
  * `StartTime`: time when the script started execution
@ -89,6 +89,11 @@ Here is a list of all data passed to the template:
    * `ToStop`: number of containers matched by the stop rule
    * `Stopped`: number of containers successfully stopped
    * `StopErrors`: number of containers that were unable to be stopped (equal to `ToStop - Stopped`)
  * `Services`: object containing stats about the docker services (only populated when Docker is running in Swarm mode)
    * `All`: total number of services
    * `ToScaleDown`: number of services matched by the scale down rule
    * `ScaledDown`: number of services successfully scaled down
    * `ScaleDownErrors`: number of services that were unable to be scaled down (equal to `ToScaleDown - ScaledDown`)
  * `BackupFile`: object containing information about the backup file
    * `Name`: name of the backup file (e.g. `backup-2022-02-11T01-00-00.tar.gz`)
    * `FullPath`: full path of the backup file (e.g. `/archive/backup-2022-02-11T01-00-00.tar.gz`)
--- a/docs/how-tos/stop-containers-during-backup.md
+++ b/docs/how-tos/stop-containers-during-backup.md
@ -7,11 +7,14 @@ nav_order: 1
 # Stop containers during backup
 {: .note }
 In case you are running Docker in Swarm mode, [dedicated documentation](./use-with-docker-swarm.html) on service and container restart applies.
 In many cases, it will be desirable to stop the services that are consuming the volume you want to backup in order to ensure data integrity.
 This image can automatically stop and restart containers and services.
 By default, any container that is labeled `docker-volume-backup.stop-during-backup=true` will be stopped before the backup is being taken and restarted once it has finished.
-In case you need more fine grained control about which containers should be stopped (e.g. when backing up multiple volumes on different schedules), you can set the `BACKUP_STOP_CONTAINER_LABEL` environment variable and then use the same value for labeling:
+In case you need more fine grained control about which containers should be stopped (e.g. when backing up multiple volumes on different schedules), you can set the `BACKUP_STOP_DURING_BACKUP_LABEL` environment variable and then use the same value for labeling:
 ```yml
 version: '3'
@ -25,7 +28,7 @@ services:
  backup:
    image: offen/docker-volume-backup:v2
    environment:
-      BACKUP_STOP_CONTAINER_LABEL: service1
+      BACKUP_STOP_DURING_BACKUP_LABEL: service1
    volumes:
      - data:/backup/my-app-backup:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
--- a/docs/how-tos/use-with-docker-swarm.md
+++ b/docs/how-tos/use-with-docker-swarm.md
@ -7,12 +7,66 @@ nav_order: 13
 # Use with Docker Swarm
-By default, Docker Swarm will restart stopped containers automatically, even when manually stopped.
+{: .note }
-If you plan to have your containers / services stopped during backup, this means you need to apply the `on-failure` restart policy to your service's definitions.
+The mechanisms described on this page __only apply when Docker is running in [Swarm mode][swarm]__.
-A restart policy of `always` is not compatible with this tool.
+
 [swarm]: https://docs.docker.com/engine/swarm/
 ## Stopping containers during backup
 Stopping and restarting containers during backup creation when running Docker in Swarm mode is supported in two ways.
 {: .important }
 Make sure you label your services and containers using only one of the described approaches.
 In case the script encounters a container that is labeled and has a parent service that is also labeled, it will exit early.
 ### Scaling services down to zero before scaling back up
 When labeling a service in the `deploy` section, the following strategy for stopping and restarting will be used:
 - The service is scaled down to zero replicas
 - The backup is created
 - The service is scaled back up to the previous number of replicas
 {: .note }
 This approach will only work for services that are deployed in __replicated mode__.
 Such a service definition could look like:
 ```yml
 services:
  app:
    image: myorg/myimage:latest
    deploy:
      labels:
        - docker-volume-backup.stop-during-backup=true
      replicas: 2
 ```
 ### Stopping the containers
 This approach bypasses the services and stops containers directly, creates the backup and restarts the containers again.
 As Docker Swarm would usually try to instantly restart containers that are manually stopped, this approach only works when using the `on-failure` restart policy.
 A restart policy of `always` is not compatible with this approach.
 Such a service definition could look like:
 ```yml
 services:
  app:
    image: myapp/myimage:latest
    labels:
      - docker-volume-backup.stop-during-backup=true
    deploy:
      replicas: 2
      restart_policy:
        condition: on-failure
 ```
 ---
 ## Memory limit considerations
 When running in Swarm mode, it's also advised to set a hard memory limit on your service (~25MB should be enough in most cases, but if you backup large files above half a gigabyte or similar, you might have to raise this in case the backup exits with `Killed`):
 ```yml
--- a/docs/recipes/index.md
+++ b/docs/recipes/index.md
@ -352,7 +352,7 @@ services:
      AWS_ACCESS_KEY_ID: AKIAIOSFODNN7EXAMPLE
      AWS_SECRET_ACCESS_KEY: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
      # Label the container using the `data_1` volume as `docker-volume-backup.stop-during-backup=service1`
-      BACKUP_STOP_CONTAINER_LABEL: service1
+      BACKUP_STOP_DURING_BACKUP_LABEL: service1
    volumes:
      - data_1:/backup/data-1-backup:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
@ -362,7 +362,7 @@ services:
      <<: *backup_environment
      # Label the container using the `data_2` volume as `docker-volume-backup.stop-during-backup=service2`
      BACKUP_CRON_EXPRESSION: "0 3 * * *"
-      BACKUP_STOP_CONTAINER_LABEL: service2
+      BACKUP_STOP_DURING_BACKUP_LABEL: service2
    volumes:
      - data_2:/backup/data-2-backup:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
--- a/docs/reference/index.md
+++ b/docs/reference/index.md
@ -316,15 +316,22 @@ You can populate below template according to your requirements and use it as you
 # GPG_PASSPHRASE="<xxx>"
-########### STOPPING CONTAINERS DURING BACKUP
+########### STOPPING CONTAINERS AND SERVICES DURING BACKUP
-# Containers can be stopped by applying a
+# Containers or services can be stopped by applying a
-# `docker-volume-backup.stop-during-backup` label. By default, all containers
+# `docker-volume-backup.stop-during-backup` label. By default, all containers and
-# that are labeled with `true` will be stopped. If you need more fine grained
+# services that are labeled with `true` will be stopped. If you need more fine
-# control (e.g. when running multiple containers based on this image), you can
+# grained control (e.g. when running multiple containers based on this image),
-# override this default by specifying a different value here.
+# you can override this default by specifying a different value here.
 # BACKUP_STOP_DURING_BACKUP_LABEL="service1"
-# BACKUP_STOP_CONTAINER_LABEL="service1"
+# When trying to scale down Docker Swarm services, give up after
 # the specified amount of time in case the service has not converged yet.
 # In case you need to adjust this timeout, supply a duration
 # value as per https://pkg.go.dev/time#ParseDuration to `BACKUP_STOP_SERVICE_TIMEOUT`.
 # Defaults to 5 minutes.
 # BACKUP_STOP_SERVICE_TIMEOUT="5m"
 ########### EXECUTING COMMANDS IN CONTAINERS PRE/POST BACKUP
--- a/go.mod
+++ b/go.mod
@ -7,6 +7,7 @@ require (
 	github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.2.1
 	github.com/containrrr/shoutrrr v0.7.1
 	github.com/cosiner/argv v0.1.0
 	github.com/docker/cli v24.0.1+incompatible
 	github.com/docker/docker v24.0.7+incompatible
 	github.com/gofrs/flock v0.8.1
 	github.com/klauspost/compress v1.17.5
@ -22,9 +23,11 @@ require (
 )
 require (
 	github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78 // indirect
 	github.com/cloudflare/circl v1.3.7 // indirect
 	github.com/golang-jwt/jwt/v5 v5.2.0 // indirect
 	github.com/golang/protobuf v1.5.3 // indirect
 	golang.org/x/time v0.0.0-20220609170525-579cf78fd858 // indirect
 	google.golang.org/appengine v1.6.7 // indirect
 	google.golang.org/protobuf v1.31.0 // indirect
 )
--- a/go.sum
+++ b/go.sum
@ -253,6 +253,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dnaeon/go-vcr v1.2.0 h1:zHCHvJYTMh1N7xnV7zf1m1GPBF9Ad0Jk/whtQ1663qI=
 github.com/dnaeon/go-vcr v1.2.0/go.mod h1:R4UdLID7HZT3taECzJs4YgbbH6PIGXB6W/sc5OLb6RQ=
 github.com/docker/cli v24.0.1+incompatible h1:uVl5Xv/39kZJpDo9VaktTOYBc702sdYYF33FqwUG/dM=
 github.com/docker/cli v24.0.1+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8=
 github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8=
 github.com/docker/distribution v2.8.2+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w=
 github.com/docker/docker v24.0.7+incompatible h1:Wo6l37AuwP3JaMnZa226lzVXGA3F9Ig1seQen0cKYlM=
@ -1241,6 +1243,7 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C
 gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gotest.tools v2.2.0+incompatible h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo=
 gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw=
 gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk=
 gotest.tools/v3 v3.0.3 h1:4AuOwCGf4lLR9u3YOe2awrHygurzhO/HeQ6laiA6Sx0=
--- a/test/collision/docker-compose.yml
+++ b/test/collision/docker-compose.yml
@ -0,0 +1,28 @@
 # Copyright 2020-2021 - Offen Authors <hioffen@posteo.de>
 # SPDX-License-Identifier: Unlicense
 version: '3.8'
 services:
  backup:
    image: offen/docker-volume-backup:${TEST_VERSION:-canary}
    environment:
      BACKUP_FILENAME: test.tar.gz
    volumes:
      - offen_data:/backup/offen_data:ro
      - ${LOCAL_DIR:-./local}:/archive
      - /var/run/docker.sock:/var/run/docker.sock
  offen:
    image: offen/offen:latest
    labels:
      - docker-volume-backup.stop-during-backup=true
    deploy:
      labels:
        - docker-volume-backup.stop-during-backup=true
      replicas: 2
    volumes:
      - offen_data:/var/opt/offen
 volumes:
  offen_data:
--- a/test/collision/run.sh
+++ b/test/collision/run.sh
@ -0,0 +1,34 @@
 #!/bin/sh
 # Deploys a stack where both a service and its containers carry the stop
 # label, and checks that the backup command fails without creating an archive
 # and without stopping any containers.
 # info, fail, pass and expect_running_containers are presumably provided by
 # ../util.sh.
 set -e
 cd "$(dirname "$0")"
 . ../util.sh
 current_test=$(basename "$(pwd)")
 export LOCAL_DIR=$(mktemp -d)
 docker swarm init
 docker stack deploy --compose-file=docker-compose.yml test_stack
 # The substitution is quoted so the test stays well-formed no matter how many
 # container IDs are printed.
 while [ -z "$(docker ps -q -f name=backup)" ]; do
  info "Backup container not ready yet. Retrying."
  sleep 1
 done
 sleep 20
 # The backup is expected to fail, so the script must not exit on error here.
 set +e
 docker exec $(docker ps -q -f name=backup) backup
 if [ $? = "0" ]; then
  fail "Expected script to exit with error code."
 fi
 if [ -f "${LOCAL_DIR}/test.tar.gz" ]; then
  fail "Found backup file that should not have been created."
 fi
 expect_running_containers "3"
 pass "Script did not perform backup as there was a label collision."
--- a/test/services/docker-compose.yml
+++ b/test/services/docker-compose.yml
@ -0,0 +1,57 @@
 # Copyright 2020-2021 - Offen Authors <hioffen@posteo.de>
 # SPDX-License-Identifier: Unlicense
 version: '3.8'
 services:
  minio:
    image: minio/minio:RELEASE.2020-08-04T23-10-51Z
    environment:
      MINIO_ROOT_USER: test
      MINIO_ROOT_PASSWORD: test
      MINIO_ACCESS_KEY: test
      MINIO_SECRET_KEY: GMusLtUmILge2by+z890kQ
    entrypoint: /bin/ash -c 'mkdir -p /data/backup && minio server /data'
    volumes:
      - backup_data:/data
  backup:
    image: offen/docker-volume-backup:${TEST_VERSION:-canary}
    depends_on:
      - minio
    environment:
      AWS_ACCESS_KEY_ID: test
      AWS_SECRET_ACCESS_KEY: GMusLtUmILge2by+z890kQ
      AWS_ENDPOINT: minio:9000
      AWS_ENDPOINT_PROTO: http
      AWS_S3_BUCKET_NAME: backup
      BACKUP_FILENAME: test.tar.gz
      BACKUP_CRON_EXPRESSION: 0 0 5 31 2 ?
      BACKUP_RETENTION_DAYS: 7
      BACKUP_PRUNING_LEEWAY: 5s
    volumes:
      - pg_data:/backup/pg_data:ro
      - /var/run/docker.sock:/var/run/docker.sock
  offen:
    image: offen/offen:latest
    deploy:
      labels:
        - docker-volume-backup.stop-during-backup=true
      replicas: 2
  pg:
    image: postgres:14-alpine
    environment:
      POSTGRES_PASSWORD: example
    volumes:
      - pg_data:/var/lib/postgresql/data
    deploy:
      labels:
        - docker-volume-backup.stop-during-backup=true
 volumes:
  backup_data:
    name: backup_data
  pg_data:
    name: pg_data
--- a/test/services/run.sh
+++ b/test/services/run.sh
@ -0,0 +1,29 @@
 #!/bin/sh
 # Deploys a stack with labeled services, runs a backup and checks that the
 # archive contains the expected files and that all containers are running
 # again afterwards.
 # info, pass and expect_running_containers are presumably provided by
 # ../util.sh.
 set -e
 cd "$(dirname "$0")"
 . ../util.sh
 current_test=$(basename "$(pwd)")
 docker swarm init
 docker stack deploy --compose-file=docker-compose.yml test_stack
 # The substitution is quoted so the test stays well-formed no matter how many
 # container IDs are printed.
 while [ -z "$(docker ps -q -f name=backup)" ]; do
  info "Backup container not ready yet. Retrying."
  sleep 1
 done
 sleep 20
 docker exec $(docker ps -q -f name=backup) backup
 docker run --rm \
  -v backup_data:/data alpine \
  ash -c 'tar -xf /data/backup/test.tar.gz && test -f /backup/pg_data/PG_VERSION'
 pass "Found relevant files in untared backup."
 # Give the services some time to get scaled back up before counting.
 sleep 5
 expect_running_containers "5"