veyron/lib/testutil/blackbox: various measures to prevent process leaks.

Add two measures to prevent child process from outliving their parents
in the case where the parent test crashes. This is a belt+braces approach
to ensure that in the normal case child processes will terminate immediately
when the parent does, but if for whatever reason the parent is hung,
the children will timeout anyway.
1. we don't use SIGHUP since it's often used for reloading configs and
by system launchers such as systemd and launchd.
2. we ensure that the go test timeout parameter is always set, aggressively,
for child processes
3. we open pipe to every child that the child issues blocking read
on and exits when it returns for any reason. Thus, if the parent dies
for whatever reason, the pipe will be closed and the child will terminate.

Change-Id: I38afa23d8ef7cbd477b337d34f1738d4afa22a65
diff --git a/lib/testutil/blackbox/subprocess.go b/lib/testutil/blackbox/subprocess.go
index db4d40c..ad37620 100644
--- a/lib/testutil/blackbox/subprocess.go
+++ b/lib/testutil/blackbox/subprocess.go
@@ -15,6 +15,7 @@
 	"testing"
 	"time"
 
+	vexec "veyron/runtimes/google/lib/exec"
 	"veyron2/vlog"
 )
 
@@ -39,12 +40,79 @@
 	Stdout *bufio.Reader // GUARDED_BY(ioLock)
 	Stdin  io.WriteCloser
 	stderr *os.File
+	closer *os.File
 	t      *testing.T
 }
 
+func testTimeout() string {
+	if timeout := flag.Lookup("test.timeout"); timeout != nil {
+		val := timeout.Value.(flag.Getter).Get().(time.Duration)
+		if val.String() != timeout.DefValue {
+			return "--test.timeout=" + timeout.Value.String()
+		}
+	}
+	return "--test.timeout=1m"
+}
+
+// BlackboxTest returns true if the environment variables passed to
+// it contain the variable (GO_WANT_HELPERP_PROCESS=1) indicating that
+// a blackbox test is configured.
+func BlackboxTest(env []string) bool {
+	for _, v := range env {
+		if v == "GO_WANT_HELPER_PROCESS=1" {
+			return true
+		}
+	}
+	return false
+}
+
+type pipeList struct {
+	sync.Mutex
+	writers []*os.File
+}
+
+var pipes pipeList
+
+// InitBlacboxParent initializes the exec.Command instance passed in for
+// use with a process that is is be run as blackbox test. This is needed
+// for processes, such as the node manager, which want to run subprocesses
+// from within blackbox tests but are not themselves test code.
+// It must be called before any modifications are made to ExtraFiles
+// and before the subprocess is started.
+func InitBlackboxParent(cmd *exec.Cmd) error {
+	writer, err := initBlackboxParent(cmd)
+	if err != nil {
+		return err
+	}
+	// Keep a reference to the writers to prevent GC from closing them
+	// and thus causing the child to terminate.
+	pipes.Lock()
+	pipes.writers = append(pipes.writers, writer)
+	pipes.Unlock()
+	return nil
+}
+
+func initBlackboxParent(cmd *exec.Cmd) (*os.File, error) {
+	// We create a pipe that the child will read from, so that when the
+	// parent dies, it will receive an EOF and then immediately exit.
+	// This should hopefully ensure that all subprocesses die when the
+	// parent does.
+	reader, writer, err := os.Pipe()
+	if err != nil {
+		return nil, err
+
+	}
+	cmd.Env = append([]string{"GO_WANT_HELPER_PROCESS=1"}, os.Environ()...)
+	cmd.ExtraFiles = []*os.File{reader}
+	return writer, nil
+}
+
 // HelperCommand() takes an argument list and starts a helper subprocess.
 func HelperCommand(t *testing.T, command string, args ...string) *Child {
 	cs := []string{"-test.run=TestHelperProcess", "--subcommand=" + command}
+	// If timeout is not specified on the command line set a
+	// default value.
+	cs = append(cs, testTimeout())
 	for fname, fval := range vlog.Log.ExplicitlySetFlags() {
 		cs = append(cs, fmt.Sprintf("--%s=%s", fname, fval))
 	}
@@ -59,7 +127,14 @@
 	} else {
 		cmd.Stderr = stderr
 	}
-	cmd.Env = append([]string{"GO_WANT_HELPER_PROCESS=1"}, os.Environ()...)
+
+	writer, err := initBlackboxParent(cmd)
+	if err != nil {
+		t.Logf("Failed to create pipe: %v", err)
+		t.FailNow()
+		return nil
+	}
+
 	stdout, _ := cmd.StdoutPipe()
 	stdin, _ := cmd.StdinPipe()
 	return &Child{
@@ -68,6 +143,8 @@
 		Stdout: bufio.NewReader(stdout),
 		Stdin:  stdin,
 		stderr: stderr,
+		// Keep the writer around to prevent GC from closing it!
+		closer: writer,
 		t:      t,
 	}
 }
@@ -89,6 +166,29 @@
 	if !ok {
 		vlog.Fatalf("Unknown cmd: '%s'", subcommand)
 	}
+
+	// This is the closer pipe set up in HelperCommand above.
+	// The child spawns a go routine to read from it and exit on EOF.
+	var closer *os.File
+	if os.Getenv("VEYRON_EXEC_VERSION") != "" {
+		// If this process was started using the veyron exec library
+		// we have to use NewExtraFile below to get at the correct
+		// offset for the fd.
+		ch, err := vexec.NewChildHandle()
+		if err != nil {
+			vlog.Fatalf("Failed to init child handle: %v", err)
+		}
+		closer = ch.NewExtraFile(0, "closer")
+	} else {
+		closer = os.NewFile(3, "closer")
+	}
+	if _, err := closer.Stat(); err == nil {
+		go func(r io.Reader) {
+			buf := make([]byte, 10)
+			n, err := r.Read(buf)
+			vlog.Fatalf("Looks like our parent exited: closer pipe read done: %d bytes (%s): err %v\n", n, string(buf), err)
+		}(closer)
+	}
 	mainFunc(flag.Args())
 	os.Exit(0)
 }
@@ -452,6 +552,7 @@
 // the child and deletes the log files generated by the child.
 func (c *Child) Cleanup() {
 	vlog.FlushLog()
+	defer c.closer.Close()
 	if c.Cmd.Process != nil {
 		c.Cmd.Process.Kill()
 	}
@@ -469,4 +570,5 @@
 			vlog.Info(c.Name + ": " + scanner.Text())
 		}
 	}
+
 }
diff --git a/lib/testutil/blackbox/subprocess_test.go b/lib/testutil/blackbox/subprocess_test.go
index 8da6ace..5105346 100644
--- a/lib/testutil/blackbox/subprocess_test.go
+++ b/lib/testutil/blackbox/subprocess_test.go
@@ -4,7 +4,12 @@
 // called. We need to mock testing.T in order to be able to catch these.
 
 import (
+	"fmt"
 	"io"
+	"os"
+	"strconv"
+	"strings"
+	"syscall"
 	"testing"
 	"time"
 
@@ -18,7 +23,10 @@
 	}
 }
 
+var globalT *testing.T
+
 func TestHelperProcess(t *testing.T) {
+	globalT = t
 	blackbox.HelperProcess(t)
 }
 
@@ -126,3 +134,120 @@
 	c.CloseStdin()
 	c.WaitForEOF(time.Second)
 }
+
+func init() {
+	blackbox.CommandTable["sleepChild"] = sleepChild
+	blackbox.CommandTable["sleepParent"] = sleepParent
+}
+
+// SleepChild spleeps essentially forever, this process should only
+// die when its parent dies and it will exit via the
+// blackbox framework and not here.
+func sleepChild(args []string) {
+	fmt.Printf("%d\n", os.Getpid())
+	time.Sleep(100 * time.Hour)
+	panic("this test should never be allowed to run this long!")
+}
+
+// Spawn the sleepChild process above.
+func spawnSleeper() *blackbox.Child {
+	c := blackbox.HelperCommand(globalT, "sleepChild")
+	c.Cmd.Start()
+	fmt.Println("ready")
+	line, err := c.ReadLineFromChild()
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "unexpected error: %v", err)
+		os.Exit(1)
+	}
+	// Print out the pid of the sleepChild first and ourselves second
+	fmt.Println(line)
+	fmt.Printf("%d\n", os.Getpid())
+	return c
+}
+
+// Spawn a sleeper and wait for our parent to tell us when to exit
+func sleepParent(args []string) {
+	c := spawnSleeper()
+	defer c.Cleanup()
+	blackbox.WaitForEOFOnStdin()
+}
+
+// Read the pids from the child processes and make sure they exist
+func readAndTestPids(t *testing.T, c *blackbox.Child) (child, parent int) {
+	// Read the pids of the sleeper and the process waiting on it
+	line, err := c.ReadLineFromChild()
+	isFatalf(t, err, "unexpected error: %v", err)
+	child, err = strconv.Atoi(strings.TrimRight(line, "\n"))
+	isFatalf(t, err, "unexpected error: %v", err)
+	line, err = c.ReadLineFromChild()
+	isFatalf(t, err, "unexpected error: %v", err)
+	parent, err = strconv.Atoi(strings.TrimRight(line, "\n"))
+	isFatalf(t, err, "unexpected error: %v", err)
+
+	// Make sure both subprocs exist
+	err = syscall.Kill(child, 0)
+	isFatalf(t, err, "unexpected error: %v", err)
+	err = syscall.Kill(parent, 0)
+	isFatalf(t, err, "unexpected error: %v", err)
+
+	return child, parent
+}
+
+// Wait for the specified pids to no longer exist
+func waitForNonExistence(t *testing.T, pids []int) {
+	w := make(chan struct{})
+	go func() {
+		for {
+			done := 0
+			for _, pid := range pids {
+				err := syscall.Kill(pid, 0)
+				if err == nil {
+					continue
+				}
+				done++
+			}
+			if done == len(pids) {
+				w <- struct{}{}
+				return
+			}
+			time.Sleep(250 * time.Millisecond)
+		}
+	}()
+
+	select {
+	case <-w:
+	case <-time.After(10 * time.Second):
+		t.Errorf("timed out on children")
+	}
+}
+
+func TestCloser(t *testing.T) {
+	c := blackbox.HelperCommand(t, "sleepParent")
+	defer c.Cleanup()
+	c.Cmd.Start()
+	c.Expect("ready")
+
+	sleeper, parent := readAndTestPids(t, c)
+
+	// Ask the sleep waiter to terminate and wait for it to do so.
+	c.CloseStdin()
+	c.ExpectEOFAndWait()
+
+	waitForNonExistence(t, []int{sleeper, parent})
+}
+
+func TestSubtimeout(t *testing.T) {
+	c := blackbox.HelperCommand(t, "sleepChild", "--test.timeout=2s")
+	defer c.Cleanup()
+	c.Cmd.Start()
+
+	line, err := c.ReadLineFromChild()
+	isFatalf(t, err, "unexpected error: %v", err)
+	child, err := strconv.Atoi(strings.TrimRight(line, "\n"))
+	err = syscall.Kill(child, 0)
+	isFatalf(t, err, "unexpected error: %v", err)
+
+	c.ExpectEOFAndWaitForExitCode(fmt.Errorf("exit status 2"))
+
+	waitForNonExistence(t, []int{child})
+}
diff --git a/runtimes/google/lib/exec/parent.go b/runtimes/google/lib/exec/parent.go
index 43de3e6..cba4a81 100644
--- a/runtimes/google/lib/exec/parent.go
+++ b/runtimes/google/lib/exec/parent.go
@@ -7,7 +7,9 @@
 	"sync"
 	"syscall"
 	"time"
+
 	"veyron/runtimes/google/lib/timekeeper"
+
 	"veyron2/vlog"
 )
 
diff --git a/services/mgmt/node/impl/invoker.go b/services/mgmt/node/impl/invoker.go
index 21874fc..c25af50 100644
--- a/services/mgmt/node/impl/invoker.go
+++ b/services/mgmt/node/impl/invoker.go
@@ -13,8 +13,10 @@
 	"strings"
 	"syscall"
 
+	"veyron/lib/testutil/blackbox"
 	vexec "veyron/runtimes/google/lib/exec"
 	ibuild "veyron/services/mgmt/build"
+
 	"veyron/services/mgmt/profile"
 	"veyron2/ipc"
 	"veyron2/services/mgmt/application"
@@ -356,6 +358,18 @@
 	cmd.Env = envelope.Env
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
+
+	// TODO(cnicolaou): figure out how to refactor the blackbox
+	// and exec packages so that this code can move into one or
+	// other of them without introducing a circular dependency.
+	if blackbox.BlackboxTest(cmd.Env) {
+		err := blackbox.InitBlackboxParent(cmd)
+		if err != nil {
+			vlog.Errorf("InitBlackboxParent() failed: %v", err)
+			return errOperationFailed
+		}
+	}
+
 	handle := vexec.NewParentHandle(cmd, "")
 	if err := handle.Start(); err != nil {
 		vlog.Errorf("Start() failed: %v", err)