2024-01-27 18:41:03 +01:00
package main
import (
"context"
"errors"
"fmt"
"io"
2024-01-29 15:15:29 +01:00
"os"
2024-01-27 18:41:03 +01:00
"sync"
"time"
"github.com/docker/cli/cli/command/service/progress"
"github.com/docker/docker/api/types"
ctr "github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/filters"
"github.com/docker/docker/api/types/swarm"
2024-01-27 19:26:39 +01:00
"github.com/docker/docker/client"
2024-01-27 18:41:03 +01:00
)
2024-01-27 19:26:39 +01:00
// scaleService sets the replica count of the swarm service identified by
// serviceID to replicas and returns the warnings reported by the update call.
//
// The service is inspected first; if it is not in replicated mode an error is
// returned and no update is attempted. After a successful update the function
// blocks on progress.ServiceProgress (with its output discarded) and forwards
// any error it reports.
func scaleService(cli *client.Client, serviceID string, replicas uint64) ([]string, error) {
	ctx := context.Background()

	svc, _, inspectErr := cli.ServiceInspectWithRaw(ctx, serviceID, types.ServiceInspectOptions{})
	if inspectErr != nil {
		return nil, fmt.Errorf("scaleService: error inspecting service %s: %w", serviceID, inspectErr)
	}

	// Only replicated services carry a replica count that can be changed.
	replicated := svc.Spec.Mode.Replicated
	if replicated == nil {
		return nil, fmt.Errorf("scaleService: service to be scaled %s has to be in replicated mode", svc.Spec.Name)
	}
	replicated.Replicas = &replicas

	updateResponse, updateErr := cli.ServiceUpdate(ctx, svc.ID, svc.Version, svc.Spec, types.ServiceUpdateOptions{})
	if updateErr != nil {
		return nil, fmt.Errorf("scaleService: error updating service: %w", updateErr)
	}

	// Progress output is of no interest here, so it is written to a sink.
	sink := &noopWriteCloser{io.Discard}
	progressErr := progress.ServiceProgress(ctx, cli, svc.ID, sink)
	if progressErr != nil {
		return nil, progressErr
	}
	return updateResponse.Warnings, nil
}
2024-01-28 14:35:02 +01:00
func awaitContainerCountForService ( cli * client . Client , serviceID string , count int , timeoutAfter time . Duration ) error {
2024-01-27 19:42:21 +01:00
poll := time . NewTicker ( time . Second )
2024-01-28 14:35:02 +01:00
timeout := time . NewTimer ( timeoutAfter )
2024-01-27 19:42:21 +01:00
defer timeout . Stop ( )
defer poll . Stop ( )
2024-01-27 19:26:39 +01:00
for {
2024-01-27 19:42:21 +01:00
select {
case <- timeout . C :
return fmt . Errorf (
2024-01-28 14:35:02 +01:00
"awaitContainerCount: timed out after waiting %s for service %s to reach desired container count of %d" ,
timeoutAfter ,
2024-01-27 19:42:21 +01:00
serviceID ,
count ,
)
case <- poll . C :
containers , err := cli . ContainerList ( context . Background ( ) , types . ContainerListOptions {
Filters : filters . NewArgs ( filters . KeyValuePair {
Key : "label" ,
Value : fmt . Sprintf ( "com.docker.swarm.service.id=%s" , serviceID ) ,
} ) ,
} )
if err != nil {
return fmt . Errorf ( "awaitContainerCount: error listing containers: %w" , err )
}
if len ( containers ) == count {
return nil
}
2024-01-27 19:26:39 +01:00
}
}
}
2024-01-27 18:41:03 +01:00
// stopContainersAndServices stops all Docker containers that are marked as to being
// stopped during the backup and returns a function that can be called to
// restart everything that has been stopped.
func ( s * script ) stopContainersAndServices ( ) ( func ( ) error , error ) {
if s . cli == nil {
return noop , nil
}
dockerInfo , err := s . cli . Info ( context . Background ( ) )
if err != nil {
return noop , fmt . Errorf ( "(*script).stopContainersAndServices: error getting docker info: %w" , err )
}
isDockerSwarm := dockerInfo . Swarm . LocalNodeState != "inactive"
2024-01-29 15:15:29 +01:00
labelValue := s . c . BackupStopDuringBackupLabel
if s . c . BackupStopContainerLabel != "" {
s . logger . Warn (
"Using BACKUP_STOP_CONTAINER_LABEL has been deprecated and will be removed in the next major version." ,
)
s . logger . Warn (
"Please use BACKUP_STOP_DURING_BACKUP_LABEL instead. Refer to the docs for an upgrade guide." ,
)
if _ , ok := os . LookupEnv ( "BACKUP_STOP_DURING_BACKUP_LABEL" ) ; ok {
return noop , errors . New ( "(*script).stopContainersAndServices: both BACKUP_STOP_DURING_BACKUP_LABEL and BACKUP_STOP_CONTAINER_LABEL have been set, cannot continue" )
}
labelValue = s . c . BackupStopContainerLabel
}
2024-01-27 18:41:03 +01:00
filterMatchLabel := fmt . Sprintf (
"docker-volume-backup.stop-during-backup=%s" ,
2024-01-29 15:15:29 +01:00
labelValue ,
2024-01-27 18:41:03 +01:00
)
allContainers , err := s . cli . ContainerList ( context . Background ( ) , types . ContainerListOptions { } )
if err != nil {
return noop , fmt . Errorf ( "(*script).stopContainersAndServices: error querying for containers: %w" , err )
}
containersToStop , err := s . cli . ContainerList ( context . Background ( ) , types . ContainerListOptions {
Filters : filters . NewArgs ( filters . KeyValuePair {
Key : "label" ,
Value : filterMatchLabel ,
} ) ,
} )
if err != nil {
return noop , fmt . Errorf ( "(*script).stopContainersAndServices: error querying for containers to stop: %w" , err )
}
var allServices [ ] swarm . Service
var servicesToScaleDown [ ] handledSwarmService
if isDockerSwarm {
allServices , err = s . cli . ServiceList ( context . Background ( ) , types . ServiceListOptions { } )
if err != nil {
return noop , fmt . Errorf ( "(*script).stopContainersAndServices: error querying for services: %w" , err )
}
matchingServices , err := s . cli . ServiceList ( context . Background ( ) , types . ServiceListOptions {
Filters : filters . NewArgs ( filters . KeyValuePair {
Key : "label" ,
Value : filterMatchLabel ,
} ) ,
Status : true ,
} )
for _ , s := range matchingServices {
servicesToScaleDown = append ( servicesToScaleDown , handledSwarmService {
serviceID : s . ID ,
initialReplicaCount : * s . Spec . Mode . Replicated . Replicas ,
} )
}
if err != nil {
return noop , fmt . Errorf ( "(*script).stopContainersAndServices: error querying for services to scale down: %w" , err )
}
}
if len ( containersToStop ) == 0 && len ( servicesToScaleDown ) == 0 {
return noop , nil
}
if isDockerSwarm {
for _ , container := range containersToStop {
if swarmServiceID , ok := container . Labels [ "com.docker.swarm.service.id" ] ; ok {
parentService , _ , err := s . cli . ServiceInspectWithRaw ( context . Background ( ) , swarmServiceID , types . ServiceInspectOptions { } )
if err != nil {
return noop , fmt . Errorf ( "(*script).stopContainersAndServices: error querying for parent service with ID %s: %w" , swarmServiceID , err )
}
for label := range parentService . Spec . Labels {
if label == "docker-volume-backup.stop-during-backup" {
return noop , fmt . Errorf (
"(*script).stopContainersAndServices: container %s is labeled to stop but has parent service %s which is also labeled, cannot continue" ,
container . Names [ 0 ] ,
parentService . Spec . Name ,
)
}
}
}
}
}
2024-01-29 16:20:50 +01:00
if isDockerSwarm {
s . logger . Info (
fmt . Sprintf (
"Stopping %d out of %d running container(s) and scaling down %d out of %d active service(s) as they were labeled %s." ,
len ( containersToStop ) ,
len ( allContainers ) ,
len ( servicesToScaleDown ) ,
len ( allServices ) ,
filterMatchLabel ,
) ,
)
} else {
s . logger . Info (
fmt . Sprintf (
"Stopping %d out of %d running container(s) as they were labeled %s." ,
len ( containersToStop ) ,
len ( allContainers ) ,
filterMatchLabel ,
) ,
)
}
2024-01-27 18:41:03 +01:00
var stoppedContainers [ ] types . Container
var stopErrors [ ] error
for _ , container := range containersToStop {
if err := s . cli . ContainerStop ( context . Background ( ) , container . ID , ctr . StopOptions { } ) ; err != nil {
stopErrors = append ( stopErrors , err )
} else {
stoppedContainers = append ( stoppedContainers , container )
}
}
2024-01-27 19:26:39 +01:00
var scaledDownServices [ ] handledSwarmService
2024-01-27 18:41:03 +01:00
var scaleDownErrors concurrentSlice [ error ]
if isDockerSwarm {
wg := sync . WaitGroup { }
for _ , svc := range servicesToScaleDown {
wg . Add ( 1 )
go func ( svc handledSwarmService ) {
defer wg . Done ( )
2024-01-27 19:26:39 +01:00
warnings , err := scaleService ( s . cli , svc . serviceID , 0 )
2024-01-27 18:41:03 +01:00
if err != nil {
scaleDownErrors . append ( err )
2024-01-27 19:26:39 +01:00
} else {
scaledDownServices = append ( scaledDownServices , svc )
2024-01-27 18:41:03 +01:00
}
2024-01-27 19:26:39 +01:00
for _ , warning := range warnings {
2024-01-27 18:41:03 +01:00
s . logger . Warn (
2024-01-27 19:26:39 +01:00
fmt . Sprintf ( "The Docker API returned a warning when scaling down service %s: %s" , svc . serviceID , warning ) ,
2024-01-27 18:41:03 +01:00
)
}
// progress.ServiceProgress returns too early, so we need to manually check
// whether all containers belonging to the service have actually been removed
2024-01-28 14:35:02 +01:00
if err := awaitContainerCountForService ( s . cli , svc . serviceID , 0 , s . c . BackupStopServiceTimeout ) ; err != nil {
2024-01-27 19:26:39 +01:00
scaleDownErrors . append ( err )
2024-01-27 18:41:03 +01:00
}
} ( svc )
}
wg . Wait ( )
}
s . stats . Containers = ContainersStats {
All : uint ( len ( allContainers ) ) ,
ToStop : uint ( len ( containersToStop ) ) ,
Stopped : uint ( len ( stoppedContainers ) ) ,
StopErrors : uint ( len ( stopErrors ) ) ,
}
s . stats . Services = ServicesStats {
All : uint ( len ( allServices ) ) ,
ToScaleDown : uint ( len ( servicesToScaleDown ) ) ,
ScaledDown : uint ( len ( scaledDownServices ) ) ,
ScaleDownErrors : uint ( len ( scaleDownErrors . value ( ) ) ) ,
}
var initialErr error
allErrors := append ( stopErrors , scaleDownErrors . value ( ) ... )
if len ( allErrors ) != 0 {
initialErr = fmt . Errorf (
"(*script).stopContainersAndServices: %d error(s) stopping containers: %w" ,
len ( allErrors ) ,
errors . Join ( allErrors ... ) ,
)
}
return func ( ) error {
var restartErrors [ ] error
2024-01-27 19:48:50 +01:00
matchedServices := map [ string ] bool { }
2024-01-27 18:41:03 +01:00
for _ , container := range stoppedContainers {
2024-01-27 19:48:50 +01:00
if swarmServiceID , ok := container . Labels [ "com.docker.swarm.service.id" ] ; ok && isDockerSwarm {
if _ , ok := matchedServices [ swarmServiceID ] ; ok {
continue
2024-01-27 18:41:03 +01:00
}
2024-01-27 19:48:50 +01:00
matchedServices [ swarmServiceID ] = true
// in case a container was part of a swarm service, the service requires to
// be force updated instead of restarting the container as it would otherwise
// remain in a "completed" state
service , _ , err := s . cli . ServiceInspectWithRaw ( context . Background ( ) , swarmServiceID , types . ServiceInspectOptions { } )
if err != nil {
2024-01-27 18:41:03 +01:00
restartErrors = append (
restartErrors ,
2024-01-27 19:48:50 +01:00
fmt . Errorf ( "(*script).stopContainersAndServices: error looking up parent service: %w" , err ) ,
2024-01-27 18:41:03 +01:00
)
continue
}
2024-01-27 19:48:50 +01:00
service . Spec . TaskTemplate . ForceUpdate += 1
2024-01-27 18:41:03 +01:00
if _ , err := s . cli . ServiceUpdate (
2024-01-27 19:48:50 +01:00
context . Background ( ) , service . ID ,
service . Version , service . Spec , types . ServiceUpdateOptions { } ,
2024-01-27 18:41:03 +01:00
) ; err != nil {
restartErrors = append ( restartErrors , err )
}
2024-01-27 19:48:50 +01:00
continue
}
if err := s . cli . ContainerStart ( context . Background ( ) , container . ID , types . ContainerStartOptions { } ) ; err != nil {
restartErrors = append ( restartErrors , err )
2024-01-27 18:41:03 +01:00
}
}
var scaleUpErrors concurrentSlice [ error ]
if isDockerSwarm {
wg := & sync . WaitGroup { }
for _ , svc := range servicesToScaleDown {
wg . Add ( 1 )
go func ( svc handledSwarmService ) {
defer wg . Done ( )
2024-01-27 19:26:39 +01:00
warnings , err := scaleService ( s . cli , svc . serviceID , svc . initialReplicaCount )
2024-01-27 18:41:03 +01:00
if err != nil {
2024-01-27 19:26:39 +01:00
scaleDownErrors . append ( err )
2024-01-27 18:41:03 +01:00
return
}
2024-01-27 19:26:39 +01:00
for _ , warning := range warnings {
2024-01-27 18:41:03 +01:00
s . logger . Warn (
2024-01-27 19:26:39 +01:00
fmt . Sprintf ( "The Docker API returned a warning when scaling up service %s: %s" , svc . serviceID , warning ) ,
2024-01-27 18:41:03 +01:00
)
}
} ( svc )
}
wg . Wait ( )
}
allErrors := append ( restartErrors , scaleUpErrors . value ( ) ... )
if len ( allErrors ) != 0 {
return fmt . Errorf (
2024-01-27 19:48:50 +01:00
"(*script).stopContainersAndServices: %d error(s) restarting containers and services: %w" ,
2024-01-27 18:41:03 +01:00
len ( allErrors ) ,
errors . Join ( allErrors ... ) ,
)
}
2024-01-29 16:20:50 +01:00
if isDockerSwarm {
s . logger . Info (
fmt . Sprintf (
"Restarted %d container(s) and %d service(s)." ,
len ( stoppedContainers ) ,
len ( scaledDownServices ) ,
) ,
)
} else {
s . logger . Info (
fmt . Sprintf (
"Restarted %d container(s)." ,
len ( stoppedContainers ) ,
) ,
)
}
2024-01-27 18:41:03 +01:00
return nil
} , initialErr
}