services/device/internal/impl: additional app liveness mechanism
Add another mechanism by which the device manager can determine if
apps are running when it restarts. This new default mechanism assumes
that a pid recorded for a process previously launched by the device
manager still refers to that selfsame process, even when it does not
respond to app cycle inquiries, and so should remain in the device
manager's purview.
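For illustration, a minimal sketch of the liveness probe this mechanism
relies on (the isAlive helper is hypothetical and not part of this
change):

    package main

    import (
        "fmt"
        "syscall"
    )

    // isAlive reports whether a process with the given pid exists.
    // Signal 0 performs the existence and permission checks without
    // delivering an actual signal.
    func isAlive(pid int) bool {
        err := syscall.Kill(pid, 0)
        // ESRCH: no such process. EPERM: the process exists but is
        // owned by another user; it still counts as alive.
        return err == nil || err == syscall.EPERM
    }

    func main() {
        fmt.Println(isAlive(syscall.Getpid())) // the current process is alive
    }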
Change-Id: I5a4dd5c5168bebe0e845c71448b32a45e74ff1bf
diff --git a/services/device/internal/impl/instance_reaping.go b/services/device/internal/impl/instance_reaping.go
index 9dc2b3e..59ae4b0 100644
--- a/services/device/internal/impl/instance_reaping.go
+++ b/services/device/internal/impl/instance_reaping.go
@@ -5,6 +5,7 @@
package impl
import (
+ "os"
"path/filepath"
"sync"
"syscall"
@@ -19,10 +20,24 @@
"v.io/x/lib/vlog"
)
+const (
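+ // AppcycleReconciliation names the environment variable that, when set
+ // to a non-empty value, selects the older appcycle-based liveness check
+ // instead of the default kill(pid, 0) probe.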
+ AppcycleReconciliation = "V23_APPCYCLE_RECONCILIATION"
+)
+
var (
errPIDIsNotInteger = verror.Register(pkgPath+".errPIDIsNotInteger", verror.NoRetry, "{1:}{2:} __debug/stats/system/pid isn't an integer{:_}")
+
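+ // v23PIDMgmt selects the liveness mechanism: true (the default) uses a
+ // kill(pid, 0) probe of the recorded pid; false queries the app's
+ // appcycle service instead.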
+ v23PIDMgmt = true
)
+func init() {
+ // TODO(rjkroege): Environment variables do not survive device manager updates.
+ // Use an alternative mechanism.
+ if os.Getenv(AppcycleReconciliation) != "" {
+ v23PIDMgmt = false
+ }
+}
+
type pidInstanceDirPair struct {
instanceDir string
pid int
@@ -57,11 +72,11 @@
}
}
-// processStatusPolling polls for the continued existence of a set of tracked
-// pids.
-// TODO(rjkroege): There are nicer ways to provide this functionality.
-// For example, use the kevent facility in darwin or replace init.
-// See http://www.incenp.org/dvlpt/wait4.html for inspiration.
+// processStatusPolling polls for the continued existence of a set of
+// tracked pids.
+// TODO(rjkroege): There are nicer ways to provide this functionality.
+// For example, use the kevent facility in darwin or replace init.
+// See http://www.incenp.org/dvlpt/wait4.html for inspiration.
func processStatusPolling(r reaper, trackedPids map[string]int) {
poll := func() {
for idir, pid := range trackedPids {
@@ -81,8 +96,9 @@
// This implementation cannot detect if a process exited
// and was replaced by an arbitrary non-Vanadium process
// within the polling interval.
- // TODO(rjkroege): Consider probing the appcycle service of
- // the pid to confirm.
+ // TODO(rjkroege): When v23PIDMgmt is false, probe the appcycle
+ // service of the app to confirm that its pid is valid.
default:
// The kill system call manpage says that this can only happen
// if the kernel claims that 0 is an invalid signal.
@@ -150,6 +166,79 @@
// In seconds.
const appCycleTimeout = 5
+// processStatusViaAppCycleMgr updates the status based on getting the
+// pid from the AppCycleMgr because the data in the instance info might
+// be outdated: the app may have exited and an arbitrary non-Vanadium
+// process may have been executed with the same pid.
+func processStatusViaAppCycleMgr(ctx *context.T, c chan<- pidErrorTuple, instancePath string, info *instanceInfo, state device.InstanceState) {
+ nctx, _ := context.WithTimeout(ctx, appCycleTimeout*time.Second)
+
+ name := naming.Join(info.AppCycleMgrName, "__debug/stats/system/pid")
+ sclient := stats.StatsClient(name)
+ v, err := sclient.Value(nctx)
+ if err != nil {
+ vlog.Infof("Instance: %v error: %v", instancePath, err)
+ // No process is actually running for this instance.
+ vlog.VI(2).Infof("perinstance stats fetching failed: %v", err)
+ if err := transitionInstance(instancePath, state, device.InstanceStateNotRunning); err != nil {
+ vlog.Errorf("transitionInstance(%s,%s,%s) failed: %v", instancePath, state, device.InstanceStateNotRunning, err)
+ }
+ c <- pidErrorTuple{ipath: instancePath, err: err}
+ return
+ }
+ // Convert the stat value from *vdl.Value into an int pid.
+ var pid int
+ if err := vdl.Convert(&pid, v); err != nil {
+ err = verror.New(errPIDIsNotInteger, ctx, err)
+ vlog.Errorf(err.Error())
+ c <- pidErrorTuple{ipath: instancePath, err: err}
+ return
+ }
+
+ ptuple := pidErrorTuple{ipath: instancePath, pid: pid}
+
+ // Update the instance info.
+ if info.Pid != pid {
+ info.Pid = pid
+ ptuple.err = saveInstanceInfo(ctx, instancePath, info)
+ }
+
+ // The instance was found to be running, so update its state accordingly
+ // (in case the device restarted while the instance was in one of the
+ // transitional states like launching, dying, etc).
+ if err := transitionInstance(instancePath, state, device.InstanceStateRunning); err != nil {
+ vlog.Errorf("transitionInstance(%s,%v,%s) failed: %v", instancePath, state, device.InstanceStateRunning, err)
+ }
+
+ vlog.VI(0).Infof("perInstance go routine for %v ending", instancePath)
+ c <- ptuple
+}
+
+// processStatusViaKill updates the status based on probing the process
+// with a signal. It assumes a system where most processes are managed
+// by the device manager, so a live process under the recorded pid that
+// is unresponsive is taken to be the instance the device manager
+// launched (the agent having been restarted) rather than an unrelated
+// process that reused the pid.
+func processStatusViaKill(c chan<- pidErrorTuple, instancePath string, info *instanceInfo, state device.InstanceState) {
+ pid := info.Pid
+
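+ // Probe the pid with signal 0: this checks that the process exists
+ // (and that we may signal it) without delivering an actual signal.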
+ switch err := syscall.Kill(pid, 0); err {
+ case syscall.ESRCH:
+ // No such PID.
+ if err := transitionInstance(instancePath, state, device.InstanceStateNotRunning); err != nil {
+ vlog.Errorf("transitionInstance(%s,%s,%s) failed: %v", instancePath, state, device.InstanceStateNotRunning, err)
+ }
+ c <- pidErrorTuple{ipath: instancePath, err: err, pid: pid}
+ case nil, syscall.EPERM:
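+ // EPERM means the pid exists but we lack permission to signal it;
+ // for reconciliation purposes that still counts as running.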
+ // The instance was found to be running, so update its state.
+ if err := transitionInstance(instancePath, state, device.InstanceStateRunning); err != nil {
+ vlog.Errorf("transitionInstance(%s,%v, %v) failed: %v", instancePath, state, device.InstanceStateRunning, err)
+ }
+ vlog.VI(0).Infof("perInstance go routine for %v ending", instancePath)
+ c <- pidErrorTuple{ipath: instancePath, err: nil, pid: pid}
+ }
+}
+
func perInstance(ctx *context.T, instancePath string, c chan<- pidErrorTuple, wg *sync.WaitGroup) {
defer wg.Done()
vlog.Infof("Instance: %v", instancePath)
@@ -169,65 +258,23 @@
return
}
vlog.VI(2).Infof("perInstance firing up on %s", instancePath)
- nctx, _ := context.WithTimeout(ctx, appCycleTimeout*time.Second)
-
- var ptuple pidErrorTuple
- ptuple.ipath = instancePath
// Read the instance data.
info, err := loadInstanceInfo(ctx, instancePath)
if err != nil {
vlog.Errorf("loadInstanceInfo failed: %v", err)
-
// Something has gone badly wrong.
// TODO(rjkroege,caprita): Consider removing the instance or at
// least set its state to something indicating error?
- ptuple.err = err
- c <- ptuple
+ c <- pidErrorTuple{err: err, ipath: instancePath}
return
}
- // Get the pid from the AppCycleMgr because the data in the instance
- // info might be outdated: the app may have exited and an arbitrary
- // non-Vanadium process may have been executed with the same pid.
- name := naming.Join(info.AppCycleMgrName, "__debug/stats/system/pid")
- sclient := stats.StatsClient(name)
- v, err := sclient.Value(nctx)
- if err != nil {
- vlog.Infof("Instance: %v error: %v", instancePath, err)
- // No process is actually running for this instance.
- vlog.VI(2).Infof("perinstance stats fetching failed: %v", err)
- if err := transitionInstance(instancePath, state, device.InstanceStateNotRunning); err != nil {
- vlog.Errorf("transitionInstance(%s,%s,%s) failed: %v", instancePath, state, device.InstanceStateNotRunning, err)
- }
- ptuple.err = err
- c <- ptuple
+ if !v23PIDMgmt {
+ processStatusViaAppCycleMgr(ctx, c, instancePath, info, state)
return
}
- // Convert the stat value from *vdl.Value into an int pid.
- var pid int
- if err := vdl.Convert(&pid, v); err != nil {
- ptuple.err = verror.New(errPIDIsNotInteger, ctx, err)
- vlog.Errorf(ptuple.err.Error())
- c <- ptuple
- return
- }
-
- ptuple.pid = pid
- // Update the instance info.
- if info.Pid != pid {
- info.Pid = pid
- ptuple.err = saveInstanceInfo(ctx, instancePath, info)
- }
- // The instance was found to be running, so update its state accordingly
- // (in case the device restarted while the instance was in one of the
- // transitional states like launching, dying, etc).
- if err := transitionInstance(instancePath, state, device.InstanceStateRunning); err != nil {
- vlog.Errorf("transitionInstance(%s,%v,%s) failed: %v", instancePath, state, device.InstanceStateRunning, err)
- }
-
- vlog.VI(0).Infof("perInstance go routine for %v ending", instancePath)
- c <- ptuple
+ processStatusViaKill(c, instancePath, info, state)
}
// Digs through the directory hierarchy
diff --git a/services/device/internal/impl/instance_reaping_kill_test.go b/services/device/internal/impl/instance_reaping_kill_test.go
new file mode 100644
index 0000000..e702a4c
--- /dev/null
+++ b/services/device/internal/impl/instance_reaping_kill_test.go
@@ -0,0 +1,131 @@
+// Copyright 2015 The Vanadium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package impl_test
+
+import (
+ "fmt"
+ "io/ioutil"
+ "os"
+ "syscall"
+ "testing"
+ "time"
+
+ "v.io/v23/services/device"
+
+ "v.io/x/ref/envvar"
+ "v.io/x/ref/services/device/internal/impl/utiltest"
+ "v.io/x/ref/services/internal/servicetest"
+)
+
+func TestReapReconciliationViaKill(t *testing.T) {
+ cleanup, ctx, sh, envelope, root, helperPath, _ := utiltest.StartupHelper(t)
+ defer cleanup()
+
+ // Start a device manager.
+ // (Since it will be restarted, use the VeyronCredentials environment
+ // to maintain the same set of credentials across runs)
+ dmCreds, err := ioutil.TempDir("", "TestReapReconciliationViaKill")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer os.RemoveAll(dmCreds)
+ dmEnv := []string{fmt.Sprintf("%v=%v", envvar.Credentials, dmCreds)}
+
+ dmh := servicetest.RunCommand(t, sh, dmEnv, utiltest.DeviceManagerCmd, "dm", root, helperPath, "unused_app_repo_name", "unused_curr_link")
+ servicetest.ReadPID(t, dmh)
+ utiltest.ClaimDevice(t, ctx, "claimable", "dm", "mydevice", utiltest.NoPairingToken)
+
+ // Create the local server that the app uses to let us know it's ready.
+ pingCh, cleanup := utiltest.SetupPingServer(t, ctx)
+ defer cleanup()
+ utiltest.Resolve(t, ctx, "pingserver", 1)
+
+ // Create an envelope for the app.
+ *envelope = utiltest.EnvelopeFromShell(sh, nil, utiltest.AppCmd, "google naps", "appV1")
+
+ // Install the app.
+ appID := utiltest.InstallApp(t, ctx)
+
+ // Start three app instances.
+ instances := make([]string, 3)
+ for i, _ := range instances {
+ instances[i] = utiltest.LaunchApp(t, ctx, appID)
+ pingCh.VerifyPingArgs(t, utiltest.UserName(t), "default", "")
+ }
+
+ // Get pid of instance[0]
+ pid := utiltest.GetPid(t, ctx, appID, instances[0])
+
+ // Shutdown the first device manager.
+ syscall.Kill(dmh.Pid(), syscall.SIGINT)
+ dmh.Expect("dm terminated")
+ dmh.ExpectEOF()
+ dmh.Shutdown(os.Stderr, os.Stderr)
+ utiltest.ResolveExpectNotFound(t, ctx, "dm") // Ensure a clean slate.
+
+ // Kill instance[0] and wait until it exits before proceeding.
+ syscall.Kill(pid, 9)
+ timeOut := time.After(5 * time.Second)
+ for syscall.Kill(pid, 0) == nil {
+ select {
+ case <-timeOut:
+ t.Fatalf("Timed out waiting for PID %v to terminate", pid)
+ case <-time.After(time.Millisecond):
+ // Try again.
+ }
+ }
+
+ // Run another device manager to replace the dead one.
+ dmh = servicetest.RunCommand(t, sh, dmEnv, utiltest.DeviceManagerCmd, "dm", root, helperPath, "unused_app_repo_name", "unused_curr_link")
+ servicetest.ReadPID(t, dmh)
+ utiltest.Resolve(t, ctx, "dm", 1) // Verify the device manager has published itself.
+
+ // By now, we've reconciled the state of the tree with which processes
+ // are actually alive. instance-0 is not alive.
+ expected := []device.InstanceState{device.InstanceStateNotRunning, device.InstanceStateRunning, device.InstanceStateRunning}
+ for i, _ := range instances {
+ utiltest.VerifyState(t, ctx, expected[i], appID, instances[i])
+ }
+
+ // Start instance[0] over-again to show that an app marked not running
+ // by reconciliation can be restarted.
+ utiltest.RunApp(t, ctx, appID, instances[0])
+ pingCh.VerifyPingArgs(t, utiltest.UserName(t), "default", "")
+
+ // Kill instance[1]
+ pid = utiltest.GetPid(t, ctx, appID, instances[1])
+ syscall.Kill(pid, 9)
+
+ // Make a fourth instance. This forces a polling of processes so that
+ // the state is updated.
+ instances = append(instances, utiltest.LaunchApp(t, ctx, appID))
+ pingCh.VerifyPingArgs(t, utiltest.UserName(t), "default", "")
+
+ // Stop the fourth instance to make sure that there's no way we could
+ // still be running the polling loop before doing the below.
+ utiltest.TerminateApp(t, ctx, appID, instances[3])
+
+ // Verify that reaper picked up the previous instances and was watching
+ // instance[1]
+ expected = []device.InstanceState{device.InstanceStateRunning, device.InstanceStateNotRunning, device.InstanceStateRunning, device.InstanceStateDeleted}
+ for i, _ := range instances {
+ utiltest.VerifyState(t, ctx, expected[i], appID, instances[i])
+ }
+
+ utiltest.TerminateApp(t, ctx, appID, instances[2])
+
+ expected = []device.InstanceState{device.InstanceStateRunning, device.InstanceStateNotRunning, device.InstanceStateDeleted, device.InstanceStateDeleted}
+ for i, _ := range instances {
+ utiltest.VerifyState(t, ctx, expected[i], appID, instances[i])
+ }
+ utiltest.TerminateApp(t, ctx, appID, instances[0])
+
+ // TODO(rjkroege): Should be in a defer to ensure that the device
+ // manager is cleaned up even if the test fails in an exceptional way.
+ utiltest.VerifyNoRunningProcesses(t)
+ syscall.Kill(dmh.Pid(), syscall.SIGINT)
+ dmh.Expect("dm terminated")
+ dmh.ExpectEOF()
+}
diff --git a/services/device/internal/impl/reaping/instance_reaping_test.go b/services/device/internal/impl/reaping/instance_reaping_test.go
index 195799d..f3378db 100644
--- a/services/device/internal/impl/reaping/instance_reaping_test.go
+++ b/services/device/internal/impl/reaping/instance_reaping_test.go
@@ -12,27 +12,15 @@
"testing"
"time"
- "v.io/v23/context"
- "v.io/v23/naming"
"v.io/v23/services/device"
- "v.io/v23/services/stats"
"v.io/x/ref/envvar"
+ "v.io/x/ref/services/device/internal/impl"
"v.io/x/ref/services/device/internal/impl/utiltest"
"v.io/x/ref/services/internal/servicetest"
)
-func getPid(t *testing.T, ctx *context.T, appID, instanceID string) int {
- name := naming.Join("dm", "apps/"+appID+"/"+instanceID+"/stats/system/pid")
- c := stats.StatsClient(name)
- v, err := c.Value(ctx)
- if err != nil {
- t.Fatalf("Value() failed: %v\n", err)
- }
- return int(v.Int())
-}
-
-func TestReapReconciliation(t *testing.T) {
+func TestReapReconciliationViaAppCycle(t *testing.T) {
cleanup, ctx, sh, envelope, root, helperPath, _ := utiltest.StartupHelper(t)
defer cleanup()
@@ -44,7 +32,7 @@
t.Fatal(err)
}
defer os.RemoveAll(dmCreds)
- dmEnv := []string{fmt.Sprintf("%v=%v", envvar.Credentials, dmCreds)}
+ dmEnv := []string{fmt.Sprintf("%v=%v", envvar.Credentials, dmCreds), fmt.Sprintf("%v=%v", impl.AppcycleReconciliation, "1")}
dmh := servicetest.RunCommand(t, sh, dmEnv, utiltest.DeviceManagerCmd, "dm", root, helperPath, "unused_app_repo_name", "unused_curr_link")
servicetest.ReadPID(t, dmh)
@@ -69,7 +57,7 @@
}
// Get pid of instance[0]
- pid := getPid(t, ctx, appID, instances[0])
+ pid := utiltest.GetPid(t, ctx, appID, instances[0])
// Shutdown the first device manager.
syscall.Kill(dmh.Pid(), syscall.SIGINT)
@@ -108,7 +96,7 @@
pingCh.VerifyPingArgs(t, utiltest.UserName(t), "default", "")
// Kill instance[1]
- pid = getPid(t, ctx, appID, instances[1])
+ pid = utiltest.GetPid(t, ctx, appID, instances[1])
syscall.Kill(pid, 9)
// Make a fourth instance. This forces a polling of processes so that
diff --git a/services/device/internal/impl/utiltest/helpers.go b/services/device/internal/impl/utiltest/helpers.go
index 8e9d88a..d7e262b 100644
--- a/services/device/internal/impl/utiltest/helpers.go
+++ b/services/device/internal/impl/utiltest/helpers.go
@@ -720,3 +720,13 @@
}
}
}
+
+func GetPid(t *testing.T, ctx *context.T, appID, instanceID string) int {
+ name := naming.Join("dm", "apps/"+appID+"/"+instanceID+"/stats/system/pid")
+ c := stats.StatsClient(name)
+ v, err := c.Value(ctx)
+ if err != nil {
+ t.Fatalf("Value() failed: %v", err)
+ }
+ return int(v.Int())
+}