lib: gosh: "fix" the wait-signal race

Adjusts the gosh.Cmd API to deal with the race:
- Adds Cmd.{Signal,Kill} methods that handle the race
- Replaces Cmd.Process with Cmd.Pid, since clients should
  not do anything else with the Process handle

The race-handling logic hardcodes a particular internal
error message of the os package ("os: process already
finished"), which is brittle but seems more or less
unavoidable.

MultiPart: 1/2
Change-Id: I9cfb25bb09a3b88a69d596c092d1797d83bf3177
diff --git a/gosh/.api b/gosh/.api
index 472dccb..f20a7a5 100644
--- a/gosh/.api
+++ b/gosh/.api
@@ -13,9 +13,11 @@
 pkg gosh, method (*Cmd) AwaitReady()
 pkg gosh, method (*Cmd) AwaitVars(...string) map[string]string
 pkg gosh, method (*Cmd) Clone() *Cmd
-pkg gosh, method (*Cmd) Process() *os.Process
+pkg gosh, method (*Cmd) Kill()
+pkg gosh, method (*Cmd) Pid() int
 pkg gosh, method (*Cmd) Run()
 pkg gosh, method (*Cmd) Shutdown(os.Signal)
+pkg gosh, method (*Cmd) Signal(os.Signal)
 pkg gosh, method (*Cmd) Start()
 pkg gosh, method (*Cmd) StderrPipe() io.Reader
 pkg gosh, method (*Cmd) StdinPipe() io.WriteCloser
diff --git a/gosh/cmd.go b/gosh/cmd.go
index 9c69d01..8678d24 100644
--- a/gosh/cmd.go
+++ b/gosh/cmd.go
@@ -146,10 +146,19 @@
 	c.handleError(c.wait())
 }
 
-// TODO(sadovsky): Maybe add a method to send SIGINT, wait for a bit, then send
-// SIGKILL if the process hasn't exited.
+// Kill causes the process to exit immediately.
+func (c *Cmd) Kill() {
+	c.sh.Ok()
+	c.handleError(c.kill())
+}
 
-// Shutdown sends the given signal to the command, then waits for it to exit.
+// Signal sends a signal to the process.
+func (c *Cmd) Signal(sig os.Signal) {
+	c.sh.Ok()
+	c.handleError(c.signal(sig))
+}
+
+// Shutdown sends a signal to the process, then waits for it to exit.
 func (c *Cmd) Shutdown(sig os.Signal) {
 	c.sh.Ok()
 	c.handleError(c.shutdown(sig))
@@ -178,12 +187,12 @@
 	return stdout, stderr
 }
 
-// Process returns the underlying process handle for the command.
-func (c *Cmd) Process() *os.Process {
-	c.sh.Ok()
-	res, err := c.process()
-	c.handleError(err)
-	return res
+// Pid returns the command's PID, or -1 if the command has not been started.
+func (c *Cmd) Pid() int {
+	if !c.started {
+		return -1
+	}
+	return c.c.Process.Pid
 }
 
 ////////////////////////////////////////
@@ -522,19 +531,40 @@
 	return <-c.waitChan
 }
 
-func (c *Cmd) shutdown(sig os.Signal) error {
+// Note: We check for this particular error message to handle the unavoidable
+// race between sending a signal to a process and the process exiting.
+// https://golang.org/src/os/exec_unix.go
+// https://golang.org/src/os/exec_windows.go
+const errFinished = "os: process already finished"
+
+func (c *Cmd) kill() error {
 	if !c.started {
 		return errDidNotCallStart
 	}
-	// TODO(sadovsky): There's a race condition here and in
-	// Shell.terminateRunningCmds. If our Process.Wait returns immediately before
-	// we call Process.Signal, Process.Signal will return an error, "os: process
-	// already finished". Should we add Cmd.Signal and Cmd.Kill methods that
-	// special-case for this error message?
 	if !c.isRunning() {
 		return nil
 	}
-	if err := c.c.Process.Signal(sig); err != nil {
+	if err := c.c.Process.Kill(); err != nil && err.Error() != errFinished {
+		return err
+	}
+	return nil
+}
+
+func (c *Cmd) signal(sig os.Signal) error {
+	if !c.started {
+		return errDidNotCallStart
+	}
+	if !c.isRunning() {
+		return nil
+	}
+	if err := c.c.Process.Signal(sig); err != nil && err.Error() != errFinished {
+		return err
+	}
+	return nil
+}
+
+func (c *Cmd) shutdown(sig os.Signal) error {
+	if err := c.signal(sig); err != nil {
 		return err
 	}
 	if err := c.wait(); err != nil {
@@ -572,10 +602,3 @@
 	err := c.run()
 	return stdout.String(), stderr.String(), err
 }
-
-func (c *Cmd) process() (*os.Process, error) {
-	if !c.started {
-		return nil, errDidNotCallStart
-	}
-	return c.c.Process, nil
-}
diff --git a/gosh/shell.go b/gosh/shell.go
index 04705ee..79992cd 100644
--- a/gosh/shell.go
+++ b/gosh/shell.go
@@ -339,7 +339,7 @@
 			continue
 		}
 		if err := c.wait(); !c.errorIsOk(err) {
-			sh.logf("%s (PID %d) failed: %v\n", c.Path, c.c.Process.Pid, err)
+			sh.logf("%s (PID %d) failed: %v\n", c.Path, c.Pid(), err)
 			res = err
 		}
 	}
@@ -468,33 +468,32 @@
 
 // Note: It is safe to run Shell.terminateRunningCmds concurrently with the
 // waiter goroutine and with Cmd.wait. In particular, Shell.terminateRunningCmds
-// only reads c.c.Process.Pid and calls c.c.Process.{Signal,Kill}, all of which
-// are thread-safe with the waiter goroutine and with Cmd.wait.
-// TODO(sadovsky): Not quite true. See TODO in Cmd.shutdown.
+// only calls c.{isRunning,Pid,Signal,Kill}, all of which are thread-safe with
+// the waiter goroutine and with Cmd.wait.
 func (sh *Shell) terminateRunningCmds() {
-	// Try SIGINT first; if that doesn't work, use SIGKILL.
+	// Try Cmd.signal first; if that doesn't work, use Cmd.kill.
 	anyRunning := sh.forEachRunningCmd(func(c *Cmd) {
-		if err := c.c.Process.Signal(os.Interrupt); err != nil {
-			sh.logf("%d.Signal(os.Interrupt) failed: %v\n", c.c.Process.Pid, err)
+		if err := c.signal(os.Interrupt); err != nil {
+			sh.logf("%d.Signal(os.Interrupt) failed: %v\n", c.Pid(), err)
 		}
 	})
 	// If any child is still running, wait for 100ms.
 	if anyRunning {
 		time.Sleep(100 * time.Millisecond)
 		anyRunning = sh.forEachRunningCmd(func(c *Cmd) {
-			sh.logf("%s (PID %d) did not die\n", c.Path, c.c.Process.Pid)
+			sh.logf("%s (PID %d) did not die\n", c.Path, c.Pid())
 		})
 	}
-	// If any child is still running, wait for another second, then send SIGKILL
-	// to all running children.
+	// If any child is still running, wait for another second, then call Cmd.kill
+	// for all running children.
 	if anyRunning {
 		time.Sleep(time.Second)
 		sh.forEachRunningCmd(func(c *Cmd) {
-			if err := c.c.Process.Kill(); err != nil {
-				sh.logf("%d.Kill() failed: %v\n", c.c.Process.Pid, err)
+			if err := c.kill(); err != nil {
+				sh.logf("%d.Kill() failed: %v\n", c.Pid(), err)
 			}
 		})
-		sh.logf("Sent SIGKILL to all remaining child processes\n")
+		sh.logf("Killed all remaining child processes\n")
 	}
 }
 
diff --git a/gosh/shell_test.go b/gosh/shell_test.go
index a6df636..8c25904 100644
--- a/gosh/shell_test.go
+++ b/gosh/shell_test.go
@@ -463,9 +463,8 @@
 			c := sh.Fn(sleepFn, d, 0)
 			c.Start()
 			// Wait for a bit to allow the zero-sleep commands to exit, to test that
-			// Shutdown succeeds for an exited process and to avoid the race condition
-			// in Cmd.Shutdown.
-			time.Sleep(10 * time.Millisecond)
+			// Shutdown succeeds for an exited process.
+			time.Sleep(100 * time.Millisecond)
 			c.Shutdown(s)
 		}
 	}
@@ -500,10 +499,12 @@
 	sh.Err = nil
 	sh.Opts.Fatalf = makeFatalf(t)
 
-	// Start commands, call wait.
+	// Start commands, then wait for them to exit.
 	for _, c := range []*gosh.Cmd{c2, c3, c4, c5, c6, c7} {
 		c.Start()
 	}
+	// Wait for a bit to allow the zero-sleep commands to exit.
+	time.Sleep(100 * time.Millisecond)
 	c5.Wait()
 	c7.Wait()
 	sh.Wait()