
Ensure that containers do not get stuck in stopping #22641

Merged · 1 commit · May 13, 2024
72 changes: 67 additions & 5 deletions libpod/container_internal.go
@@ -1326,10 +1326,8 @@ func (c *Container) start(ctx context.Context) error {
 	return nil
 }
 
-// Internal, non-locking function to stop container
-func (c *Container) stop(timeout uint) error {
-	logrus.Debugf("Stopping ctr %s (timeout %d)", c.ID(), timeout)
-
+// Whether a container should use `all` when stopping
+func (c *Container) stopWithAll() (bool, error) {
 	// If the container is running in a PID Namespace, then killing the
 	// primary pid is enough to kill the container. If it is not running in
 	// a pid namespace then the OCI Runtime needs to kill ALL processes in
@@ -1345,14 +1343,26 @@ func (c *Container) stop(timeout uint) error {
 			// Only do this check if we need to
 			unified, err := cgroups.IsCgroup2UnifiedMode()
 			if err != nil {
-				return err
+				return false, err
 			}
 			if !unified {
 				all = false
 			}
 		}
 	}
 
+	return all, nil
+}
+
+// Internal, non-locking function to stop container
+func (c *Container) stop(timeout uint) error {
+	logrus.Debugf("Stopping ctr %s (timeout %d)", c.ID(), timeout)
+
+	all, err := c.stopWithAll()
+	if err != nil {
+		return err
+	}
+
 	// OK, the following code looks a bit weird but we have to make sure we can stop
 	// containers with the restart policy always, to do this we have to set
 	// StoppedByUser even when there is nothing to stop right now. This is due to the
@@ -1442,6 +1452,58 @@ func (c *Container) waitForConmonToExitAndSave() error {
 		return err
 	}
 
+	// If we are still ContainerStateStopping, conmon exited without
+	// creating an exit file. Let's try and handle that here.
+	if c.state.State == define.ContainerStateStopping {
+		// Is container PID1 still alive?
+		if err := unix.Kill(c.state.PID, 0); err == nil {
+			// We have a runaway container, unmanaged by
+			// Conmon. Invoke OCI runtime stop.
+			// Use 0 timeout for immediate SIGKILL as things
+			// have gone seriously wrong.
+			// Ignore the error from stopWithAll, it's just
+			// a cgroup check - more important that we
+			// continue.
+			// If we wanted to be really fancy here, we
+			// could open a pidfd on container PID1 before
+			// this to get the real exit code... But I'm not
+			// that dedicated.
+			all, _ := c.stopWithAll()
+			if err := c.ociRuntime.StopContainer(c, 0, all); err != nil {
+				logrus.Errorf("Error stopping container %s after Conmon exited prematurely: %v", c.ID(), err)
+			}
+		}
+
+		// Conmon is dead. Handle it.
+		c.state.State = define.ContainerStateStopped
+		c.state.PID = 0
+		c.state.ConmonPID = 0
+		c.state.FinishedTime = time.Now()
+		c.state.ExitCode = -1
+		c.state.Exited = true
+
+		c.state.Error = "conmon died without writing exit file, container exit code could not be retrieved"
+
+		c.newContainerExitedEvent(c.state.ExitCode)
+
+		if err := c.save(); err != nil {
+			logrus.Errorf("Error saving container %s state after Conmon exited prematurely: %v", c.ID(), err)
+		}
+
+		if err := c.runtime.state.AddContainerExitCode(c.ID(), c.state.ExitCode); err != nil {
+			logrus.Errorf("Error saving container %s exit code after Conmon exited prematurely: %v", c.ID(), err)
+		}
+
+		// No Conmon alive to trigger cleanup, and the calls in
+		// regular Podman are conditional on no errors.
+		// Need to clean up manually.
+		if err := c.cleanup(context.Background()); err != nil {
+			logrus.Errorf("Error cleaning up container %s after Conmon exited prematurely: %v", c.ID(), err)
+		}
+
+		return fmt.Errorf("container %s conmon exited prematurely, exit code could not be retrieved: %w", c.ID(), define.ErrInternal)
+	}
+
 	return c.save()
 }
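
A side note on the `stopWithAll` change above: the `all` decision hinges on `cgroups.IsCgroup2UnifiedMode()` from containers/common. For readers unfamiliar with that check, here is a minimal sketch of how cgroup v2 unified-mode detection can work on Linux — an illustration of the idea, not the library's actual implementation:

```go
package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// isCgroup2Unified reports whether /sys/fs/cgroup is a pure cgroup v2
// (unified) hierarchy, by checking the filesystem magic of the mount.
// Minimal sketch: the real helper caches its result and covers more
// edge cases.
func isCgroup2Unified() (bool, error) {
	var st unix.Statfs_t
	if err := unix.Statfs("/sys/fs/cgroup", &st); err != nil {
		return false, err
	}
	return st.Type == unix.CGROUP2_SUPER_MAGIC, nil
}

func main() {
	unified, err := isCgroup2Unified()
	fmt.Println(unified, err)
}
```

On a pure cgroup v2 host, /sys/fs/cgroup is itself a cgroup2 mount, so the filesystem magic is enough to distinguish the hierarchies.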

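The comment in `waitForConmonToExitAndSave` muses that opening a pidfd on container PID1 could recover the real exit code. A rough sketch of that idea (not part of this PR; assumes Linux and golang.org/x/sys/unix):

```go
package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// waitViaPidfd blocks until the process identified by pid exits.
// Unlike a bare kill(pid, 0) poll, the pidfd keeps referring to this
// exact process even if the PID is reused. Getting the exit status
// would additionally need waitid(P_PIDFD, ...), which only succeeds
// for a child (or when we are a subreaper).
func waitViaPidfd(pid int) error {
	fd, err := unix.PidfdOpen(pid, 0)
	if err != nil {
		return fmt.Errorf("opening pidfd for %d: %w", pid, err)
	}
	defer unix.Close(fd)

	// A pidfd polls readable once the process terminates.
	pfds := []unix.PollFd{{Fd: int32(fd), Events: unix.POLLIN}}
	if _, err := unix.Poll(pfds, -1); err != nil {
		return fmt.Errorf("polling pidfd: %w", err)
	}
	return nil
}

func main() {
	fmt.Println(waitViaPidfd(1234)) // 1234: hypothetical container PID1
}
```

The reaping restriction is likely why the in-code comment calls this "really fancy": Podman is not the parent of container PID1, so the exit status is not directly recoverable this way.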
40 changes: 40 additions & 0 deletions test/system/030-run.bats
@@ -1465,4 +1465,44 @@ search | $IMAGE |
     is "$output" "Error.*: $expect" "podman emits useful diagnostic when no entrypoint is set"
 }
 
+@test "podman run - stopping loop" {
+    skip_if_remote "this doesn't work with the REST service"
+
+    run_podman run -d --name testctr --stop-timeout 240 $IMAGE sh -c 'echo READY; sleep 999'
+    cid="$output"
+    wait_for_ready testctr
+
+    run_podman inspect --format '{{ .State.ConmonPid }}' testctr
+    conmon_pid=$output
+
+    ${PODMAN} stop testctr &
+    stop_pid=$!
+
+    timeout=20
+    while :;do
+        sleep 0.5
+        run_podman container inspect --format '{{.State.Status}}' testctr
+        if [[ "$output" = "stopping" ]]; then
+            break
+        fi
+        timeout=$((timeout - 1))
+        if [[ $timeout == 0 ]]; then
+            run_podman ps -a
+            die "Timed out waiting for testctr to acknowledge signal"
+        fi
+    done
+
+    kill -9 ${stop_pid}
+    kill -9 ${conmon_pid}
+
+    # Unclear why `-t0` is required here; it works locally without it.
+    # But it shouldn't hurt and does make the test pass...
+    PODMAN_TIMEOUT=5 run_podman 125 stop -t0 testctr
+
> **Member:** @edsantiago Why PODMAN_TIMEOUT=5?

> **Member:** Leftover artifact from my testing: I got tired of ^C-ing on every single iteration (until I hit upon the -t0 trick). I just copy-pasted what I had, and didn't notice that it included PODMAN_TIMEOUT. When I noticed it in the PR, I thought it probably doesn't hurt anything. Does it?

> **Member:** It is confusing to readers who ask why it is here.
is "$output" "Error: container .* conmon exited prematurely, exit code could not be retrieved: internal libpod error" "correct error on missing conmon"

# This should be safe because stop is guaranteed to call cleanup?
run_podman inspect --format "{{ .State.Status }}" testctr
is "$output" "exited" "container has successfully transitioned to exited state after stop"
> **Member:** Hmmm... I'm nervous. In my version, this line said stopped. Yours says exited. Is that because of your c.cleanup() change? If it isn't, is it possible that we're hitting that old bug where sometimes podman sets one and sometimes the other? I'm running a test loop now to see if I ever get stopped.

> **Member Author:** Yes. We want exited. That indicates that cleanup has run properly: the container is unmounted and networking is torn down. stopped means we're still mounted, and we don't want to be stuck there.
+}
 
 # vim: filetype=sh
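
For reference, the recovery path this test exercises begins with the signal-0 liveness probe in the libpod change above (`unix.Kill(c.state.PID, 0)`). A standalone sketch of that idiom, with a hypothetical `pid`:

```go
package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// alive reports whether a process with the given PID currently exists.
// Signal 0 performs the existence check without delivering a signal.
// Caveats: the PID may have been reused, and for processes we may not
// signal, kill returns EPERM even though the process exists; the libpod
// change treats only err == nil as "alive".
func alive(pid int) bool {
	return unix.Kill(pid, 0) == nil
}

func main() {
	fmt.Println(alive(unix.Getpid())) // true: our own process exists
}
```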