72 lines
2.9 KiB
Diff
72 lines
2.9 KiB
Diff
From 2a6d6321956d0a6edbec7421357e14c01dc7f5ab Mon Sep 17 00:00:00 2001
|
|
From: chenjiankun <chenjiankun1@huawei.com>
|
|
Date: Fri, 13 Jan 2023 17:13:22 +0800
|
|
Subject: [PATCH] docker: set freezer.state to Thawed to increase freeze
|
|
chances
|
|
|
|
docker pause/unpause with parallel docker exec can leave the cgroup in the
|
|
FREEZING state; set freezer.state to Thawed to increase freeze chances
|
|
|
|
An occasional short sleep before reading the state back also improves
|
|
the chances of a successful freeze on a very slow system.
|
|
---
|
|
components/engine/daemon/freezer/freezer.go | 28 +++++++++++++++++++++
|
|
1 file changed, 28 insertions(+)
|
|
|
|
diff --git a/components/engine/daemon/freezer/freezer.go b/components/engine/daemon/freezer/freezer.go
|
|
index 6df176f2f..bd45304f4 100644
|
|
--- a/components/engine/daemon/freezer/freezer.go
|
|
+++ b/components/engine/daemon/freezer/freezer.go
|
|
@@ -186,6 +186,7 @@ func (f *freezer) updateCgroup(state string) error {
|
|
timeout := time.After(30 * time.Second)
|
|
ticker := time.NewTicker(1 * time.Millisecond)
|
|
defer ticker.Stop()
|
|
+ count := 0
|
|
for {
|
|
select {
|
|
case <-timeout:
|
|
@@ -194,6 +195,26 @@ func (f *freezer) updateCgroup(state string) error {
|
|
}
|
|
return fmt.Errorf("update freezer cgroup timeout for 30s")
|
|
case <-ticker.C:
|
|
+ // As per older kernel docs (freezer-subsystem.txt before
|
|
+ // kernel commit ef9fe980c6fcc1821), if FREEZING is seen,
|
|
+ // userspace should either retry or thaw. While current
|
|
+ // kernel cgroup v1 docs no longer mention a need to retry,
|
|
+ // the kernel (tested on v5.4, Ubuntu 20.04) can't reliably
|
|
+ // freeze a cgroup while new processes keep appearing in it
|
|
+ // (either via fork/clone or by writing new PIDs to
|
|
+ // cgroup.procs).
|
|
+ //
|
|
+ // The numbers below are chosen to have a decent chance to
|
|
+ // succeed even in the worst case scenario (docker pause/unpause
|
|
+ // with parallel docker exec).
|
|
+ //
|
|
+ // Sleeping between every retry (for any duration) did not increase the
|
|
+ // chances of a successful freeze, but the occasional sleeps below do.
|
|
+ if count++; count % 50 == 0 && state == string(configs.Frozen) {
|
|
+ writeFile(f.path, "freezer.state", string(configs.Thawed))
|
|
+ time.Sleep(10 * time.Millisecond)
|
|
+ }
|
|
+
|
|
// In case this loop does not exit because it doesn't get the expected
|
|
// state, let's write again this state, hoping it's going to be properly
|
|
// set this time. Otherwise, this loop could run infinitely, waiting for
|
|
@@ -201,6 +222,13 @@ func (f *freezer) updateCgroup(state string) error {
|
|
if err := writeFile(f.path, "freezer.state", state); err != nil {
|
|
return fmt.Errorf("cannot write freezer.state for %#v", err)
|
|
}
|
|
+ if count%25 == 24 {
|
|
+ // Occasional short sleep before reading
|
|
+ // the state back also improves the chances to
|
|
+ // succeed in freezing in case of a very slow
|
|
+ // system.
|
|
+ time.Sleep(10 * time.Microsecond)
|
|
+ }
|
|
newState, err := readFile(f.path, "freezer.state")
|
|
if err != nil {
|
|
return fmt.Errorf("read freezer.state failed after write: %v", err)
|
|
--
|
|
2.33.0
|
|
|