gosh: Create a new process group for each child

Create a new process group for each child process. This allows us to
send signals to the child process and any other processes in the same
process group, which typically includes the other processes started by
child process.

Use the process group to clean up after the child process exits.

MultiPart: 1/2

Change-Id: Iad3daab102a29144d36d5d1f0e153b7df233f459
diff --git a/gosh/cmd.go b/gosh/cmd.go
index e7b7e22..e7b8b7a 100644
--- a/gosh/cmd.go
+++ b/gosh/cmd.go
@@ -78,6 +78,8 @@
 	stdinDoneChan     chan error
 	started           bool // protected by sh.cleanupMu
 	exited            bool // protected by cond.L
+	calledCleanup     bool // protected by cleanupMu
+	cleanupMu         sync.Mutex
 	stdoutHeadTail    *headTail
 	stderrHeadTail    *headTail
 	stdoutWriters     []io.Writer
@@ -616,6 +618,12 @@
 		return err
 	}
 	c.c.ExtraFiles = c.ExtraFiles
+	// Create a new process group for the child.
+	if c.c.SysProcAttr == nil {
+		c.c.SysProcAttr = &syscall.SysProcAttr{}
+	}
+	c.c.SysProcAttr.Setpgid = true
+	c.c.SysProcAttr.Pgid = 0
 	// Start the command.
 	if err = c.c.Start(); err != nil {
 		return err
@@ -646,6 +654,7 @@
 			}
 		}
 		c.waitChan <- waitErr
+		c.cleanupProcessGroup()
 	}()
 }
 
@@ -732,6 +741,32 @@
 	return nil
 }
 
+func (c *Cmd) cleanupProcessGroup() {
+	if !c.started {
+		return
+	}
+	c.cleanupMu.Lock()
+	defer c.cleanupMu.Unlock()
+
+	if c.calledCleanup {
+		return
+	}
+	c.calledCleanup = true
+
+	// Send SIGINT first; then, after a grace period, send SIGKILL to any
+	// process that is still running.
+	if err := syscall.Kill(-c.Pid(), syscall.SIGINT); err == syscall.ESRCH {
+		return
+	}
+	for i := 0; i < 10; i++ {
+		time.Sleep(100 * time.Millisecond)
+		if err := syscall.Kill(-c.Pid(), 0); err == syscall.ESRCH {
+			return
+		}
+	}
+	syscall.Kill(-c.Pid(), syscall.SIGKILL)
+}
+
 func (c *Cmd) terminate(sig os.Signal) error {
 	if err := c.signal(sig); err != nil {
 		return err
diff --git a/gosh/shell.go b/gosh/shell.go
index 3bab48d..9d6dd1d 100644
--- a/gosh/shell.go
+++ b/gosh/shell.go
@@ -471,55 +471,29 @@
 	return nil
 }
 
-// forEachRunningCmd applies f to each running child process.
-func (sh *Shell) forEachRunningCmd(f func(*Cmd)) bool {
-	anyRunning := false
+// Note: It is safe to run Shell.cleanupRunningCmds concurrently with the waiter
+// goroutine and with Cmd.wait. In particular, Shell.cleanupRunningCmds only
+// calls c.{isRunning,Pid}, all of which are thread-safe with the waiter
+// goroutine and with Cmd.wait.
+func (sh *Shell) cleanupRunningCmds() {
+	var wg sync.WaitGroup
 	for _, c := range sh.cmds {
-		if c.isRunning() {
-			anyRunning = true
-			if f != nil {
-				f(c)
-			}
+		if !c.started {
+			continue
 		}
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			c.cleanupProcessGroup()
+		}()
 	}
-	return anyRunning
-}
-
-// Note: It is safe to run Shell.terminateRunningCmds concurrently with the
-// waiter goroutine and with Cmd.wait. In particular, Shell.terminateRunningCmds
-// only calls c.{isRunning,Pid,signal}, all of which are thread-safe with the
-// waiter goroutine and with Cmd.wait.
-func (sh *Shell) terminateRunningCmds() {
-	// Send os.Interrupt first; if that doesn't work, send os.Kill.
-	anyRunning := sh.forEachRunningCmd(func(c *Cmd) {
-		if err := c.signal(os.Interrupt); err != nil {
-			sh.tb.Logf("%d.Signal(os.Interrupt) failed: %v\n", c.Pid(), err)
-		}
-	})
-	// If any child is still running, wait for 100ms.
-	if anyRunning {
-		time.Sleep(100 * time.Millisecond)
-		anyRunning = sh.forEachRunningCmd(func(c *Cmd) {
-			sh.tb.Logf("%s (PID %d) did not die\n", c.Path, c.Pid())
-		})
-	}
-	// If any child is still running, wait for another second, then send os.Kill
-	// to all running children.
-	if anyRunning {
-		time.Sleep(time.Second)
-		sh.forEachRunningCmd(func(c *Cmd) {
-			if err := c.signal(os.Kill); err != nil {
-				sh.tb.Logf("%d.Signal(os.Kill) failed: %v\n", c.Pid(), err)
-			}
-		})
-		sh.tb.Logf("Killed all remaining child processes\n")
-	}
+	wg.Wait()
 }
 
 func (sh *Shell) cleanup() {
 	sh.calledCleanup = true
-	// Terminate all children that are still running.
-	sh.terminateRunningCmds()
+	// Clean up all children that are still running.
+	sh.cleanupRunningCmds()
 	// Close and delete all temporary files.
 	for _, tempFile := range sh.tempFiles {
 		name := tempFile.Name()
diff --git a/gosh/shell_test.go b/gosh/shell_test.go
index 74fea0b..3c157c5 100644
--- a/gosh/shell_test.go
+++ b/gosh/shell_test.go
@@ -18,12 +18,14 @@
 	"io"
 	"io/ioutil"
 	"os"
+	"os/exec"
 	"os/signal"
 	"path/filepath"
 	"reflect"
 	"runtime"
 	"runtime/debug"
 	"strings"
+	"syscall"
 	"testing"
 	"time"
 
@@ -870,6 +872,46 @@
 	setsErr(t, sh, func() { c.Signal(os.Interrupt) })
 }
 
+var processGroup = gosh.RegisterFunc("processGroup", func(dir string, n int) {
+	for x := 0; x < n; x++ {
+		c := exec.Command("bash", "-c", fmt.Sprintf("trap 'echo > log.%d; exit' INT; echo READY; sleep 60", x))
+		c.Dir = dir
+		out, err := c.StdoutPipe()
+		if err != nil {
+			panic(err)
+		}
+		c.Start()
+		bufio.NewReader(out).ReadString('\n')
+	}
+	gosh.SendVars(map[string]string{"ready": ""})
+	time.Sleep(time.Minute)
+})
+
+func TestCleanupProcessGroup(t *testing.T) {
+	sh := gosh.NewShell(t)
+	defer sh.Cleanup()
+
+	workdir := sh.MakeTempDir()
+	const N = 5
+	c := sh.FuncCmd(processGroup, workdir, N)
+	c.Start()
+	c.AwaitVars("ready")
+	c.Signal(os.Interrupt)
+
+	// Wait for the process group to be gone.
+	for syscall.Kill(-c.Pid(), 0) != syscall.ESRCH {
+		time.Sleep(100 * time.Millisecond)
+	}
+	count := 0
+	for x := 0; x < N; x++ {
+		f := filepath.Join(workdir, fmt.Sprintf("log.%d", x))
+		if _, err := os.Stat(f); err == nil {
+			count++
+		}
+	}
+	eq(t, count, N)
+}
+
 func TestTerminate(t *testing.T) {
 	sh := gosh.NewShell(t)
 	defer sh.Cleanup()