Merge "services/device/device/publish.go: specifiable envelope name"
diff --git a/services/device/internal/impl/app_service.go b/services/device/internal/impl/app_service.go
index 19b2970..b3163cb 100644
--- a/services/device/internal/impl/app_service.go
+++ b/services/device/internal/impl/app_service.go
@@ -160,6 +160,8 @@
 	Pid                      int
 	DeviceManagerPeerPattern string
 	SecurityAgentHandle      []byte
+	Restarts                 int32
+	RestartWindowBegan       time.Time
 }
 
 func saveInstanceInfo(ctx *context.T, dir string, info *instanceInfo) error {
@@ -957,25 +959,45 @@
 	return nil
 }
 
-func (i *appRunner) restartAppIfNecessary(instanceDir string) {
+func synchronizedShouldRestart(instanceDir string) bool {
 	info, err := loadInstanceInfo(nil, instanceDir)
 	if err != nil {
 		vlog.Error(err)
-		return
+		return false
 	}
 
 	envelope, err := loadEnvelopeForInstance(nil, instanceDir)
 	if err != nil {
 		vlog.Error(err)
+		return false
+	}
+
+	shouldRestart := newBasicRestartPolicy().decide(envelope, info)
+
+	if err := saveInstanceInfo(nil, instanceDir, info); err != nil {
+		vlog.Error(err)
+		return false
+	}
+	return shouldRestart
+}
+
+func (i *appRunner) restartAppIfNecessary(instanceDir string) {
+	if err := transitionInstance(instanceDir, device.InstanceStateNotRunning, device.InstanceStateLaunching); err != nil {
+		vlog.Error(err)
 		return
 	}
 
-	// Determine if we should restart.
-	if !neverStart().decide(envelope, info) {
+	shouldRestart := synchronizedShouldRestart(instanceDir)
+
+	if err := transitionInstance(instanceDir, device.InstanceStateLaunching, device.InstanceStateNotRunning); err != nil {
+		vlog.Error(err)
 		return
 	}
 
-	// TODO(rjkroege): Implement useful restart policy.
+	if !shouldRestart {
+		return
+	}
+
 	if err := i.run(nil, instanceDir); err != nil {
 		vlog.Error(err)
 	}
diff --git a/services/device/internal/impl/daemonreap/daemon_reaping_test.go b/services/device/internal/impl/daemonreap/daemon_reaping_test.go
index bcf4fc9..765eb82 100644
--- a/services/device/internal/impl/daemonreap/daemon_reaping_test.go
+++ b/services/device/internal/impl/daemonreap/daemon_reaping_test.go
@@ -9,6 +9,7 @@
 	"testing"
 	"time"
 
+	"v.io/v23/context"
 	"v.io/v23/naming"
 	"v.io/v23/services/device"
 	"v.io/v23/services/stats"
@@ -35,7 +36,7 @@
 	utiltest.Resolve(t, ctx, "pingserver", 1, true)
 
 	// Create an envelope for a first version of the app that will be restarted once.
-	*envelope = utiltest.EnvelopeFromShell(sh, nil, utiltest.App, "google naps", 1, 10*time.Second, "appV1")
+	*envelope = utiltest.EnvelopeFromShell(sh, nil, utiltest.App, "google naps", 1, 10*time.Minute, "appV1")
 	appID := utiltest.InstallApp(t, ctx)
 
 	// Start an instance of the app.
@@ -45,7 +46,58 @@
 	pingCh.VerifyPingArgs(t, utiltest.UserName(t), "default", "")
 
 	// Get application pid.
-	name := naming.Join("dm", "apps/"+appID+"/"+instance1ID+"/stats/system/pid")
+	pid := getPid(t, ctx, appID, instance1ID)
+
+	utiltest.VerifyState(t, ctx, device.InstanceStateRunning, appID, instance1ID)
+	syscall.Kill(int(pid), 9)
+	pollingWait(t, int(pid))
+
+	// Start a second instance of the app which will force polling to happen.
+	// During this polling, the reaper will restart app instance1
+	instance2ID := utiltest.LaunchApp(t, ctx, appID)
+	pingCh.VerifyPingArgs(t, utiltest.UserName(t), "default", "")
+	utiltest.VerifyState(t, ctx, device.InstanceStateRunning, appID, instance2ID)
+
+	// Stop the second instance of the app which will also force polling. By this point,
+	// instance1 should be live.
+	utiltest.KillApp(t, ctx, appID, instance2ID)
+
+	// instance2ID is not running.
+	utiltest.VerifyState(t, ctx, device.InstanceStateNotRunning, appID, instance2ID)
+
+	// instance1ID was restarted automatically.
+	utiltest.VerifyState(t, ctx, device.InstanceStateRunning, appID, instance1ID)
+
+	// Get application pid.
+	pid = getPid(t, ctx, appID, instance1ID)
+	// Be sure to get the ping from the restarted application so that we block in
+	// RunApp below.
+	pingCh.WaitForPingArgs(t)
+	// Kill the application again.
+	syscall.Kill(int(pid), 9)
+	pollingWait(t, int(pid))
+
+	// Start and stop instance 2 again to force two polling cycles.
+	utiltest.RunApp(t, ctx, appID, instance2ID)
+	pingCh.VerifyPingArgs(t, utiltest.UserName(t), "default", "")
+	utiltest.KillApp(t, ctx, appID, instance2ID)
+
+	// instance1ID is not running because it exceeded its restart limit.
+	utiltest.VerifyState(t, ctx, device.InstanceStateNotRunning, appID, instance1ID)
+
+	// instance2ID is not running.
+	utiltest.VerifyState(t, ctx, device.InstanceStateNotRunning, appID, instance2ID)
+
+	// Cleanly shut down the device manager.
+	utiltest.VerifyNoRunningProcesses(t)
+	syscall.Kill(dmh.Pid(), syscall.SIGINT)
+	dmh.Expect("dm terminated")
+	dmh.ExpectEOF()
+}
+
+// getPid returns the application pid
+func getPid(t *testing.T, ctx *context.T, appID, instanceID string) int {
+	name := naming.Join("dm", "apps/"+appID+"/"+instanceID+"/stats/system/pid")
 	c := stats.StatsClient(name)
 	v, err := c.Value(ctx)
 	if err != nil {
@@ -55,33 +107,19 @@
 	if err := vdl.Convert(&pid, v); err != nil {
 		t.Fatalf("pid returned from stats interface is not an int: %v", err)
 	}
+	return pid
+}
 
-	utiltest.VerifyState(t, ctx, device.InstanceStateRunning, appID, instance1ID)
-	syscall.Kill(int(pid), 9)
-
-	// Start a second instance of the app which will force polling to happen.
-	instance2ID := utiltest.LaunchApp(t, ctx, appID)
-	pingCh.VerifyPingArgs(t, utiltest.UserName(t), "default", "")
-
-	// TODO(rjkroege): Because there is no daemon mode, instance1ID is not running even
-	// though it should be.
-	utiltest.VerifyState(t, ctx, device.InstanceStateNotRunning, appID, instance1ID)
-
-	// TODO(rjkroege): Demonstrate that the device manager will only restart the app the
-	// configured number of times (1)
-
-	// instance2ID is still running though.
-	utiltest.VerifyState(t, ctx, device.InstanceStateRunning, appID, instance2ID)
-
-	// Cleanup.
-	utiltest.TerminateApp(t, ctx, appID, instance2ID)
-
-	// TODO(rjkroege): instance1ID isn't running but should be.
-	// utiltest.TerminateApp(t, ctx, appID, instance1ID)
-
-	// Cleanly shut down the device manager.
-	utiltest.VerifyNoRunningProcesses(t)
-	syscall.Kill(dmh.Pid(), syscall.SIGINT)
-	dmh.Expect("dm terminated")
-	dmh.ExpectEOF()
+// pollingWait waits until the specificed process is actually dead so that
+// that the test can be certain that the process is actually dead before
+// continuing.
+func pollingWait(t *testing.T, pid int) {
+	for {
+		switch err := syscall.Kill(pid, 0); {
+		case err == syscall.ESRCH:
+			return
+		case err != nil:
+			t.Fatalf("syscall.Kill not working as expected: %v", err)
+		}
+	}
 }
diff --git a/services/device/internal/impl/instance_reaping.go b/services/device/internal/impl/instance_reaping.go
index e300283..16cee82 100644
--- a/services/device/internal/impl/instance_reaping.go
+++ b/services/device/internal/impl/instance_reaping.go
@@ -92,7 +92,7 @@
 				// No such PID.
 				vlog.VI(2).Infof("processStatusPolling discovered pid %d ended", pid)
 				markNotRunning(idir)
-				r.startState.restartAppIfNecessary(idir)
+				go r.startState.restartAppIfNecessary(idir)
 				delete(trackedPids, idir)
 			case nil, syscall.EPERM:
 				vlog.VI(2).Infof("processStatusPolling saw live pid: %d", pid)
@@ -130,6 +130,7 @@
 				return
 			case p.pid == -1: // stop watching this instance
 				delete(trackedPids, p.instanceDir)
+				poll()
 			case p.pid == -2: // kill the process
 				if pid, ok := trackedPids[p.instanceDir]; ok {
 					if err := suidHelper.terminatePid(pid, nil, nil); err != nil {
diff --git a/services/device/internal/impl/restart_policy.go b/services/device/internal/impl/restart_policy.go
index 57c6cff..465d0e0 100644
--- a/services/device/internal/impl/restart_policy.go
+++ b/services/device/internal/impl/restart_policy.go
@@ -5,6 +5,8 @@
 package impl
 
 import (
+	"time"
+
 	"v.io/v23/services/application"
 )
 
@@ -16,21 +18,37 @@
 	decide(envelope *application.Envelope, instance *instanceInfo) bool
 }
 
-// startStub implements a stub RestartPolicy.
-type startStub struct {
-	start bool
+type basicDecisionPolicy struct {
 }
 
-// alwaysStart returns a RestartPolicy that always (re)starts the application.
-func alwaysStart() restartPolicy {
-	return startStub{true}
+func newBasicRestartPolicy() restartPolicy {
+	return new(basicDecisionPolicy)
 }
 
-// neverStart returns a RestartPolicy that never (re)starts the application.
-func neverStart() restartPolicy {
-	return startStub{false}
-}
+func (rp *basicDecisionPolicy) decide(envelope *application.Envelope, instance *instanceInfo) bool {
+	if envelope.Restarts == 0 {
+		return false
+	}
+	if envelope.Restarts < 0 {
+		return true
+	}
 
-func (s startStub) decide(envelope *application.Envelope, instance *instanceInfo) bool {
-	return s.start
+	if instance.Restarts == 0 {
+		instance.RestartWindowBegan = time.Now()
+	}
+
+	endOfWindow := instance.RestartWindowBegan.Add(envelope.RestartTimeWindow)
+	if time.Now().After(endOfWindow) {
+		instance.Restarts = 1
+		instance.RestartWindowBegan = time.Now()
+		return true
+	}
+
+	if instance.Restarts < envelope.Restarts {
+		instance.Restarts++
+		return true
+	}
+
+	return false
+
 }