lib: gosh: "fix" the wait-signal race
Adjusts the gosh.Cmd API to deal with the race:
- Adds Cmd.{Signal,Kill} methods that handle the race
- Replaces Cmd.Process with Cmd.Pid, since clients should
not do anything else with the Process handle
The race-handling logic hardcodes a particular internal
error message of the os package ("os: process already
finished"), which is brittle but seems more or less
unavoidable.
MultiPart: 1/2
Change-Id: I9cfb25bb09a3b88a69d596c092d1797d83bf3177
diff --git a/gosh/.api b/gosh/.api
index 472dccb..f20a7a5 100644
--- a/gosh/.api
+++ b/gosh/.api
@@ -13,9 +13,11 @@
pkg gosh, method (*Cmd) AwaitReady()
pkg gosh, method (*Cmd) AwaitVars(...string) map[string]string
pkg gosh, method (*Cmd) Clone() *Cmd
-pkg gosh, method (*Cmd) Process() *os.Process
+pkg gosh, method (*Cmd) Kill()
+pkg gosh, method (*Cmd) Pid() int
pkg gosh, method (*Cmd) Run()
pkg gosh, method (*Cmd) Shutdown(os.Signal)
+pkg gosh, method (*Cmd) Signal(os.Signal)
pkg gosh, method (*Cmd) Start()
pkg gosh, method (*Cmd) StderrPipe() io.Reader
pkg gosh, method (*Cmd) StdinPipe() io.WriteCloser
diff --git a/gosh/cmd.go b/gosh/cmd.go
index 9c69d01..8678d24 100644
--- a/gosh/cmd.go
+++ b/gosh/cmd.go
@@ -146,10 +146,19 @@
c.handleError(c.wait())
}
-// TODO(sadovsky): Maybe add a method to send SIGINT, wait for a bit, then send
-// SIGKILL if the process hasn't exited.
+// Kill causes the process to exit immediately.
+func (c *Cmd) Kill() {
+ c.sh.Ok()
+ c.handleError(c.kill())
+}
-// Shutdown sends the given signal to the command, then waits for it to exit.
+// Signal sends a signal to the process.
+func (c *Cmd) Signal(sig os.Signal) {
+ c.sh.Ok()
+ c.handleError(c.signal(sig))
+}
+
+// Shutdown sends a signal to the process, then waits for it to exit.
func (c *Cmd) Shutdown(sig os.Signal) {
c.sh.Ok()
c.handleError(c.shutdown(sig))
@@ -178,12 +187,12 @@
return stdout, stderr
}
-// Process returns the underlying process handle for the command.
-func (c *Cmd) Process() *os.Process {
- c.sh.Ok()
- res, err := c.process()
- c.handleError(err)
- return res
+// Pid returns the command's PID, or -1 if the command has not been started.
+func (c *Cmd) Pid() int {
+ if !c.started {
+ return -1
+ }
+ return c.c.Process.Pid
}
////////////////////////////////////////
@@ -522,19 +531,40 @@
return <-c.waitChan
}
-func (c *Cmd) shutdown(sig os.Signal) error {
+// Note: We check for this particular error message to handle the unavoidable
+// race between sending a signal to a process and the process exiting.
+// https://golang.org/src/os/exec_unix.go
+// https://golang.org/src/os/exec_windows.go
+const errFinished = "os: process already finished"
+
+func (c *Cmd) kill() error {
if !c.started {
return errDidNotCallStart
}
- // TODO(sadovsky): There's a race condition here and in
- // Shell.terminateRunningCmds. If our Process.Wait returns immediately before
- // we call Process.Signal, Process.Signal will return an error, "os: process
- // already finished". Should we add Cmd.Signal and Cmd.Kill methods that
- // special-case for this error message?
if !c.isRunning() {
return nil
}
- if err := c.c.Process.Signal(sig); err != nil {
+ if err := c.c.Process.Kill(); err != nil && err.Error() != errFinished {
+ return err
+ }
+ return nil
+}
+
+func (c *Cmd) signal(sig os.Signal) error {
+ if !c.started {
+ return errDidNotCallStart
+ }
+ if !c.isRunning() {
+ return nil
+ }
+ if err := c.c.Process.Signal(sig); err != nil && err.Error() != errFinished {
+ return err
+ }
+ return nil
+}
+
+func (c *Cmd) shutdown(sig os.Signal) error {
+ if err := c.signal(sig); err != nil {
return err
}
if err := c.wait(); err != nil {
@@ -572,10 +602,3 @@
err := c.run()
return stdout.String(), stderr.String(), err
}
-
-func (c *Cmd) process() (*os.Process, error) {
- if !c.started {
- return nil, errDidNotCallStart
- }
- return c.c.Process, nil
-}
diff --git a/gosh/shell.go b/gosh/shell.go
index 04705ee..79992cd 100644
--- a/gosh/shell.go
+++ b/gosh/shell.go
@@ -339,7 +339,7 @@
continue
}
if err := c.wait(); !c.errorIsOk(err) {
- sh.logf("%s (PID %d) failed: %v\n", c.Path, c.c.Process.Pid, err)
+ sh.logf("%s (PID %d) failed: %v\n", c.Path, c.Pid(), err)
res = err
}
}
@@ -468,33 +468,32 @@
// Note: It is safe to run Shell.terminateRunningCmds concurrently with the
// waiter goroutine and with Cmd.wait. In particular, Shell.terminateRunningCmds
-// only reads c.c.Process.Pid and calls c.c.Process.{Signal,Kill}, all of which
-// are thread-safe with the waiter goroutine and with Cmd.wait.
-// TODO(sadovsky): Not quite true. See TODO in Cmd.shutdown.
+// only calls c.{isRunning,Pid,Signal,Kill}, all of which are thread-safe with
+// the waiter goroutine and with Cmd.wait.
func (sh *Shell) terminateRunningCmds() {
- // Try SIGINT first; if that doesn't work, use SIGKILL.
+ // Try Cmd.signal first; if that doesn't work, use Cmd.kill.
anyRunning := sh.forEachRunningCmd(func(c *Cmd) {
- if err := c.c.Process.Signal(os.Interrupt); err != nil {
- sh.logf("%d.Signal(os.Interrupt) failed: %v\n", c.c.Process.Pid, err)
+ if err := c.signal(os.Interrupt); err != nil {
+ sh.logf("%d.Signal(os.Interrupt) failed: %v\n", c.Pid(), err)
}
})
// If any child is still running, wait for 100ms.
if anyRunning {
time.Sleep(100 * time.Millisecond)
anyRunning = sh.forEachRunningCmd(func(c *Cmd) {
- sh.logf("%s (PID %d) did not die\n", c.Path, c.c.Process.Pid)
+ sh.logf("%s (PID %d) did not die\n", c.Path, c.Pid())
})
}
- // If any child is still running, wait for another second, then send SIGKILL
- // to all running children.
+ // If any child is still running, wait for another second, then call Cmd.kill
+ // for all running children.
if anyRunning {
time.Sleep(time.Second)
sh.forEachRunningCmd(func(c *Cmd) {
- if err := c.c.Process.Kill(); err != nil {
- sh.logf("%d.Kill() failed: %v\n", c.c.Process.Pid, err)
+ if err := c.kill(); err != nil {
+ sh.logf("%d.Kill() failed: %v\n", c.Pid(), err)
}
})
- sh.logf("Sent SIGKILL to all remaining child processes\n")
+ sh.logf("Killed all remaining child processes\n")
}
}
diff --git a/gosh/shell_test.go b/gosh/shell_test.go
index a6df636..8c25904 100644
--- a/gosh/shell_test.go
+++ b/gosh/shell_test.go
@@ -463,9 +463,8 @@
c := sh.Fn(sleepFn, d, 0)
c.Start()
// Wait for a bit to allow the zero-sleep commands to exit, to test that
- // Shutdown succeeds for an exited process and to avoid the race condition
- // in Cmd.Shutdown.
- time.Sleep(10 * time.Millisecond)
+ // Shutdown succeeds for an exited process.
+ time.Sleep(100 * time.Millisecond)
c.Shutdown(s)
}
}
@@ -500,10 +499,12 @@
sh.Err = nil
sh.Opts.Fatalf = makeFatalf(t)
- // Start commands, call wait.
+ // Start commands, then wait for them to exit.
for _, c := range []*gosh.Cmd{c2, c3, c4, c5, c6, c7} {
c.Start()
}
+ // Wait for a bit to allow the zero-sleep commands to exit.
+ time.Sleep(100 * time.Millisecond)
c5.Wait()
c7.Wait()
sh.Wait()