Merge "services/device/internal/impl: daemon mode refactoring"
diff --git a/services/device/internal/impl/app_service.go b/services/device/internal/impl/app_service.go
index 8b1ce68..00ceb43 100644
--- a/services/device/internal/impl/app_service.go
+++ b/services/device/internal/impl/app_service.go
@@ -190,15 +190,15 @@
 	keyMgrAgent *keymgr.Agent
 }
 
-// appStart is the subset of the appService object needed to
+// appRunner is the subset of the appService object needed to
 // (re)start an application.
-type appStartState struct {
-	systemName  string
-	instanceDir string
-	callback    *callbackState
+type appRunner struct {
+	callback *callbackState
 	// securityAgent holds state related to the security agent (nil if not
 	// using the agent).
 	securityAgent *securityAgentState
+	// reap is the app process monitoring subsystem.
+	reap *reaper
 	// mtAddress is the address of the local mounttable.
 	mtAddress string
 }
@@ -214,9 +214,8 @@
 	permsStore *pathperms.PathStore
 	// Reference to the devicemanager top-level AccessList list.
 	deviceAccessList access.Permissions
-	// reap is the app process monitoring subsystem.
-	reap     reaper
-	appStart *appStartState
+	// State needed to (re)start an application.
+	runner *appRunner
 }
 
 func saveEnvelope(ctx *context.T, dir string, envelope *application.Envelope) error {
@@ -242,6 +241,15 @@
 	return envelope, nil
 }
 
+func loadEnvelopeForInstance(ctx *context.T, instanceDir string) (*application.Envelope, error) {
+	versionLink := filepath.Join(instanceDir, "version")
+	versionDir, err := filepath.EvalSymlinks(versionLink)
+	if err != nil {
+		return nil, verror.New(ErrOperationFailed, ctx, fmt.Sprintf("EvalSymlinks(%v) failed: %v", versionLink, err))
+	}
+	return loadEnvelope(ctx, versionDir)
+}
+
 func saveConfig(ctx *context.T, dir string, config device.Config) error {
 	jsonConfig, err := json.Marshal(config)
 	if err != nil {
@@ -712,7 +720,7 @@
 		return instanceDir, instanceID, verror.New(ErrOperationFailed, ctx, fmt.Sprintf("Symlink(%v, %v) failed: %v", packagesDir, packagesLink, err))
 	}
 	instanceInfo := new(instanceInfo)
-	if err := setupPrincipal(ctx, instanceDir, call, i.appStart.securityAgent, instanceInfo); err != nil {
+	if err := setupPrincipal(ctx, instanceDir, call, i.runner.securityAgent, instanceInfo); err != nil {
 		return instanceDir, instanceID, err
 	}
 	if err := saveInstanceInfo(ctx, instanceDir, instanceInfo); err != nil {
@@ -735,10 +743,11 @@
 	return instanceDir, instanceID, nil
 }
 
-func (i appStartState) genCmd(ctx *context.T) (*exec.Cmd, error) {
-	instanceDir := i.instanceDir
-	systemName := i.systemName
-	nsRoot := i.mtAddress
+func genCmd(ctx *context.T, instanceDir string, nsRoot string) (*exec.Cmd, error) {
+	systemName, err := readSystemNameForInstance(instanceDir)
+	if err != nil {
+		return nil, err
+	}
 
 	versionLink := filepath.Join(instanceDir, "version")
 	versionDir, err := filepath.EvalSymlinks(versionLink)
@@ -802,8 +811,7 @@
 	return suidHelper.getAppCmd(&saArgs)
 }
 
-func (i appStartState) startCmd(ctx *context.T, cmd *exec.Cmd) (int, error) {
-	instanceDir := i.instanceDir
+func (i *appRunner) startCmd(ctx *context.T, instanceDir string, cmd *exec.Cmd) (int, error) {
 	info, err := loadInstanceInfo(ctx, instanceDir)
 	if err != nil {
 		return 0, err
@@ -905,27 +913,51 @@
 	return pid, nil
 }
 
-func (i appStartState) run(ctx *context.T, reap reaper) error {
-	if err := transitionInstance(i.instanceDir, device.InstanceStateNotRunning, device.InstanceStateLaunching); err != nil {
+func (i *appRunner) run(ctx *context.T, instanceDir string) error {
+	if err := transitionInstance(instanceDir, device.InstanceStateNotRunning, device.InstanceStateLaunching); err != nil {
 		return err
 	}
 	var pid int
 
-	cmd, err := i.genCmd(ctx)
+	cmd, err := genCmd(ctx, instanceDir, i.mtAddress)
 	if err == nil {
-		pid, err = i.startCmd(ctx, cmd)
+		pid, err = i.startCmd(ctx, instanceDir, cmd)
 	}
 	if err != nil {
-		transitionInstance(i.instanceDir, device.InstanceStateLaunching, device.InstanceStateNotRunning)
+		transitionInstance(instanceDir, device.InstanceStateLaunching, device.InstanceStateNotRunning)
 		return err
 	}
-	if err := transitionInstance(i.instanceDir, device.InstanceStateLaunching, device.InstanceStateRunning); err != nil {
+	if err := transitionInstance(instanceDir, device.InstanceStateLaunching, device.InstanceStateRunning); err != nil {
 		return err
 	}
-	reap.startWatching(i.instanceDir, pid)
+	i.reap.startWatching(instanceDir, pid)
 	return nil
 }
 
+func (i *appRunner) restartAppIfNecessary(instanceDir string) {
+	info, err := loadInstanceInfo(nil, instanceDir)
+	if err != nil {
+		vlog.Error(err)
+		return
+	}
+
+	envelope, err := loadEnvelopeForInstance(nil, instanceDir)
+	if err != nil {
+		vlog.Error(err)
+		return
+	}
+
+	// Determine if we should restart.
+	if !neverStart().decide(envelope, info) {
+		return
+	}
+
+	// TODO(rjkroege): Implement useful restart policy.
+	if err := i.run(nil, instanceDir); err != nil {
+		vlog.Error(err)
+	}
+}
+
 func (i *appService) Instantiate(ctx *context.T, call device.ApplicationInstantiateServerCall) (string, error) {
 	helper := i.config.Helper
 	instanceDir, instanceID, err := i.newInstance(ctx, call)
@@ -976,10 +1008,7 @@
 	if startSystemName != systemName {
 		return verror.New(verror.ErrNoAccess, ctx, "Not allowed to resume an application under a different system name.")
 	}
-
-	i.appStart.instanceDir = instanceDir
-	i.appStart.systemName = systemName
-	return i.appStart.run(ctx, i.reap)
+	return i.runner.run(ctx, instanceDir)
 }
 
 func stopAppRemotely(ctx *context.T, appVON string, deadline time.Duration) error {
@@ -1003,7 +1032,7 @@
 	return nil
 }
 
-func stop(ctx *context.T, instanceDir string, reap reaper, deadline time.Duration) error {
+func stop(ctx *context.T, instanceDir string, reap *reaper, deadline time.Duration) error {
 	info, err := loadInstanceInfo(ctx, instanceDir)
 	if err != nil {
 		return err
@@ -1032,7 +1061,7 @@
 	if err := transitionInstance(instanceDir, device.InstanceStateRunning, device.InstanceStateDying); err != nil {
 		return err
 	}
-	if err := stop(ctx, instanceDir, i.reap, deadline); err != nil {
+	if err := stop(ctx, instanceDir, i.runner.reap, deadline); err != nil {
 		transitionInstance(instanceDir, device.InstanceStateDying, device.InstanceStateRunning)
 		return err
 	}
@@ -1484,21 +1513,19 @@
 	} else {
 		debugInfo.StartSystemName = startSystemName
 	}
-	as := *i.appStart
-	as.instanceDir = instanceDir
-	as.systemName = debugInfo.SystemName
-	if cmd, err := as.genCmd(ctx); err != nil {
-		return "", err
-	} else {
-		debugInfo.Cmd = cmd
-	}
+
 	if info, err := loadInstanceInfo(ctx, instanceDir); err != nil {
 		return "", err
 	} else {
 		debugInfo.Info = info
 	}
+	if cmd, err := genCmd(ctx, instanceDir, i.runner.mtAddress); err != nil {
+		return "", err
+	} else {
+		debugInfo.Cmd = cmd
+	}
 
-	if sa := i.appStart.securityAgent; sa != nil {
+	if sa := i.runner.securityAgent; sa != nil {
 		file, err := sa.keyMgrAgent.NewConnection(debugInfo.Info.SecurityAgentHandle)
 		if err != nil {
 			vlog.Errorf("NewConnection(%v) failed: %v", debugInfo.Info.SecurityAgentHandle, err)
diff --git a/services/device/internal/impl/dispatcher.go b/services/device/internal/impl/dispatcher.go
index 86f57f4..62d4787 100644
--- a/services/device/internal/impl/dispatcher.go
+++ b/services/device/internal/impl/dispatcher.go
@@ -41,6 +41,8 @@
 	securityAgent  *securityAgentState
 	restartHandler func()
 	testMode       bool
+	// reap is the app process monitoring subsystem.
+	reap *reaper
 }
 
 // dispatcher holds the state of the device manager dispatcher.
@@ -57,8 +59,6 @@
 	permsStore *pathperms.PathStore
 	// Namespace
 	mtAddress string // The address of the local mounttable.
-	// reap is the app process monitoring subsystem.
-	reap reaper
 }
 
 var _ rpc.Dispatcher = (*dispatcher)(nil)
@@ -118,10 +118,6 @@
 	if err != nil {
 		return nil, verror.New(errCantCreateAccountStore, ctx, err)
 	}
-	reap, err := newReaper(ctx, config.Root)
-	if err != nil {
-		return nil, verror.New(errCantCreateAppWatcher, ctx, err)
-	}
 	initSuidHelper(config.Helper)
 	d := &dispatcher{
 		internal: &internalState{
@@ -134,7 +130,6 @@
 		uat:        uat,
 		permsStore: permStore,
 		mtAddress:  mtAddress,
-		reap:       reap,
 	}
 
 	// If we're in 'security agent mode', set up the key manager agent.
@@ -147,6 +142,15 @@
 			}
 		}
 	}
+	reap, err := newReaper(ctx, config.Root, &appRunner{
+		callback:      d.internal.callback,
+		securityAgent: d.internal.securityAgent,
+	})
+	if err != nil {
+		return nil, verror.New(errCantCreateAppWatcher, ctx, err)
+	}
+	d.internal.reap = reap
+
 	if testMode {
 		return &testModeDispatcher{d}, nil
 	}
@@ -157,7 +161,7 @@
 func Shutdown(rpcd rpc.Dispatcher) {
 	switch d := rpcd.(type) {
 	case *dispatcher:
-		d.reap.shutdown()
+		d.internal.reap.shutdown()
 	case *testModeDispatcher:
 		Shutdown(d.realDispatcher)
 	default:
@@ -326,8 +330,8 @@
 			suffix:     components[1:],
 			uat:        d.uat,
 			permsStore: d.permsStore,
-			reap:       d.reap,
-			appStart: &appStartState{
+			runner: &appRunner{
+				reap:          d.internal.reap,
 				callback:      d.internal.callback,
 				securityAgent: d.internal.securityAgent,
 				mtAddress:     d.mtAddress,
diff --git a/services/device/internal/impl/instance_reaping.go b/services/device/internal/impl/instance_reaping.go
index 59ae4b0..e300283 100644
--- a/services/device/internal/impl/instance_reaping.go
+++ b/services/device/internal/impl/instance_reaping.go
@@ -43,11 +43,14 @@
 	pid         int
 }
 
-type reaper chan pidInstanceDirPair
+type reaper struct {
+	c          chan pidInstanceDirPair
+	startState *appRunner
+}
 
 var stashedPidMap map[string]int
 
-func newReaper(ctx *context.T, root string) (reaper, error) {
+func newReaper(ctx *context.T, root string, startState *appRunner) (*reaper, error) {
 	pidMap, err := findAllTheInstances(ctx, root)
 
 	// Used only by the testing code that verifies that all processes
@@ -57,9 +60,13 @@
 		return nil, err
 	}
 
-	c := make(reaper)
-	go processStatusPolling(c, pidMap)
-	return c, nil
+	r := &reaper{
+		c:          make(chan pidInstanceDirPair),
+		startState: startState,
+	}
+	r.startState.reap = r
+	go r.processStatusPolling(pidMap)
+	return r, nil
 }
 
 func markNotRunning(idir string) {
@@ -77,7 +84,7 @@
 // functionality. For example, use the kevent facility in darwin or
 // replace init. See http://www.incenp.org/dvlpt/wait4.html for
 // inspiration.
-func processStatusPolling(r reaper, trackedPids map[string]int) {
+func (r *reaper) processStatusPolling(trackedPids map[string]int) {
 	poll := func() {
 		for idir, pid := range trackedPids {
 			switch err := syscall.Kill(pid, 0); err {
@@ -85,6 +92,7 @@
 				// No such PID.
 				vlog.VI(2).Infof("processStatusPolling discovered pid %d ended", pid)
 				markNotRunning(idir)
+				r.startState.restartAppIfNecessary(idir)
 				delete(trackedPids, idir)
 			case nil, syscall.EPERM:
 				vlog.VI(2).Infof("processStatusPolling saw live pid: %d", pid)
@@ -99,6 +107,11 @@
 				// TODO(rjkroege): Probe the appcycle service of the app
 				// to confirm that its pid is valid iff v23PIDMgmt
 				// is false.
+
+				// TODO(rjkroege): if we can't connect to the app here via
+				// the appcycle manager, the app was probably started under
+				// a different agent and cannot be managed. Perhaps we should
+				// then kill the app and restart it?
 			default:
 				// The kill system call manpage says that this can only happen
 				// if the kernel claims that 0 is an invalid signal.
@@ -111,7 +124,7 @@
 
 	for {
 		select {
-		case p, ok := <-r:
+		case p, ok := <-r.c:
 			switch {
 			case !ok:
 				return
@@ -139,22 +152,22 @@
 // startWatching begins watching process pid's state. This routine
 // assumes that pid already exists. Since pid is delivered to the device
 // manager by RPC callback, this seems reasonable.
-func (r reaper) startWatching(idir string, pid int) {
-	r <- pidInstanceDirPair{instanceDir: idir, pid: pid}
+func (r *reaper) startWatching(idir string, pid int) {
+	r.c <- pidInstanceDirPair{instanceDir: idir, pid: pid}
 }
 
 // stopWatching stops watching process pid's state.
-func (r reaper) stopWatching(idir string) {
-	r <- pidInstanceDirPair{instanceDir: idir, pid: -1}
+func (r *reaper) stopWatching(idir string) {
+	r.c <- pidInstanceDirPair{instanceDir: idir, pid: -1}
 }
 
 // forciblySuspend terminates the process pid
-func (r reaper) forciblySuspend(idir string) {
-	r <- pidInstanceDirPair{instanceDir: idir, pid: -2}
+func (r *reaper) forciblySuspend(idir string) {
+	r.c <- pidInstanceDirPair{instanceDir: idir, pid: -2}
 }
 
-func (r reaper) shutdown() {
-	close(r)
+func (r *reaper) shutdown() {
+	close(r.c)
 }
 
 type pidErrorTuple struct {
diff --git a/services/device/internal/impl/restart_policy.go b/services/device/internal/impl/restart_policy.go
new file mode 100644
index 0000000..57c6cff
--- /dev/null
+++ b/services/device/internal/impl/restart_policy.go
@@ -0,0 +1,36 @@
+// Copyright 2015 The Vanadium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package impl
+
+import (
+	"v.io/v23/services/application"
+)
+
+// RestartPolicy instances provide a policy for deciding if an
+// application should be restarted on failure.
+type restartPolicy interface {
+	// decide determines if this application instance should be (re)started, returning
+	// true if the application should be be (re)started.
+	decide(envelope *application.Envelope, instance *instanceInfo) bool
+}
+
+// startStub implements a stub RestartPolicy.
+type startStub struct {
+	start bool
+}
+
+// alwaysStart returns a RestartPolicy that always (re)starts the application.
+func alwaysStart() restartPolicy {
+	return startStub{true}
+}
+
+// neverStart returns a RestartPolicy that never (re)starts the application.
+func neverStart() restartPolicy {
+	return startStub{false}
+}
+
+func (s startStub) decide(envelope *application.Envelope, instance *instanceInfo) bool {
+	return s.start
+}