veyron/services/mgmt/node/impl: implement Resume and Suspend

Details:

- factor out common code shared between Resume and Start, and between Suspend
  and Stop, respectively; separate creating a command from running it, and
  creating an instance from running it

- introduce a different way to track the state of the instance; instead of
  renaming the instance dir to "stopped-<>", we now have a status file inside
  each instance dir, whose name reflects the current state of the instance
  ("suspended", "started", "stopped", "starting", "suspending",
  "stopping"). Transitions between these states are performed with os.Rename,
  which should be atomic.

- add version link from each instance to the installation version it was created
  from; needed by Resume

Change-Id: I3bc9cc8eada505edebb8117b0bf9313bbe81898d
diff --git a/services/mgmt/node/impl/app_invoker.go b/services/mgmt/node/impl/app_invoker.go
index ef8301a..3571978 100644
--- a/services/mgmt/node/impl/app_invoker.go
+++ b/services/mgmt/node/impl/app_invoker.go
@@ -23,10 +23,9 @@
 //           logs/                  - stderr/stdout and log files generated by instance
 //           info                   - app manager name and process id for the instance (if running)
 //           version                - symbolic link to installation version for the instance
+//           <status>               - one of the values for instanceState enum
 //         instance-<id b>
 //         ...
-//         stopped-instance-<id c>  - stopped instances have their directory name prepended by 'stopped-'
-//         ...
 //     installation-<id 2>
 //     ...
 //   app-<hash 2>
@@ -48,13 +47,31 @@
 // Concurrency model: installations can be created independently of one another;
 // installations can be removed at any time (any running instances will be
 // stopped). The first call to Uninstall will rename the installation dir as a
-// first step; subsequent Uninstalls will fail. Instances can be created
+// first step; subsequent Uninstall's will fail. Instances can be created
 // independently of one another, as long as the installation exists (if it gets
-// Uninstalled during an instance Start, the Start may fail). When an instance
-// is stopped, the first call to Stop renames the instance dir; subsequent Stop
-// calls will fail. Resume will attempt to create an info file; if one exists
-// already, Resume fails. Suspend will attempt to rename the info file; if none
-// present, Suspend will fail.
+// Uninstall'ed during an instance Start, the Start may fail).
+//
+// The status file present in each instance is used to flag the state of the
+// instance and prevent concurrent operations against the instance:
+//
+// - when an instance is created with Start, it is placed in state 'suspended'.
+// To run the instance, Start transitions 'suspended' to 'starting' and then
+// 'started' (upon success) or the instance is deleted (upon failure).
+//
+// - Suspend attempts to transition from 'started' to 'suspending' (if the
+// instance was not in 'started' state, Suspend fails). From 'suspending', the
+// instance transitions to 'suspended' upon success or back to 'started' upon
+// failure.
+//
+// - Resume attempts to transition from 'suspended' to 'starting' (if the
+// instance was not in 'suspended' state, Resume fails). From 'starting', the
+// instance transitions to 'started' upon success or back to 'suspended' upon
+// failure.
+//
+// - Stop attempts to transition from 'started' to 'stopping' and then to
+// 'stopped' (upon success) or back to 'started' (upon failure); or from
+// 'suspended' to 'stopped'.  If the initial state is neither 'started' nor
+// 'suspended', Stop fails.
 //
 // TODO(caprita): There is room for synergy between how node manager organizes
 // its own workspace and that for the applications it runs.  In particular,
@@ -88,6 +105,61 @@
 	"veyron2/vlog"
 )
 
+// instanceState describes the states that an instance can be in at any time.
+type instanceState int
+
+const (
+	starting instanceState = iota
+	started
+	suspending
+	suspended
+	stopping
+	stopped
+)
+
+// String returns the name that will be used to encode the state as a file name
+// in the instance's dir.
+func (s instanceState) String() string {
+	switch s {
+	case starting:
+		return "starting"
+	case started:
+		return "started"
+	case suspending:
+		return "suspending"
+	case suspended:
+		return "suspended"
+	case stopping:
+		return "stopping"
+	case stopped:
+		return "stopped"
+	default:
+		return "unknown"
+	}
+}
+
+func transition(instanceDir string, initial, target instanceState) error {
+	initialState := filepath.Join(instanceDir, initial.String())
+	targetState := filepath.Join(instanceDir, target.String())
+	if err := os.Rename(initialState, targetState); err != nil {
+		if os.IsNotExist(err) {
+			return errInvalidOperation
+		}
+		vlog.Errorf("Rename(%v, %v) failed: %v", initialState, targetState, err) // Something went really wrong.
+		return errOperationFailed
+	}
+	return nil
+}
+
+func initializeState(instanceDir string, initial instanceState) error {
+	initialStatus := filepath.Join(instanceDir, initial.String())
+	if err := ioutil.WriteFile(initialStatus, []byte("status"), 0600); err != nil {
+		vlog.Errorf("WriteFile(%v) failed: %v", initialStatus, err)
+		return errOperationFailed
+	}
+	return nil
+}
+
 // instanceInfo holds state about a running instance.
 type instanceInfo struct {
 	AppCycleMgrName string
@@ -200,8 +272,13 @@
 	return "instance-" + instanceID
 }
 
-func stoppedInstanceDirName(instanceID string) string {
-	return "stopped-instance-" + instanceID
+func mkdir(dir string) error {
+	perm := os.FileMode(0700)
+	if err := os.MkdirAll(dir, perm); err != nil {
+		vlog.Errorf("MkdirAll(%v, %v) failed: %v", dir, perm, err)
+		return err
+	}
+	return nil
 }
 
 func (i *appInvoker) Install(call ipc.ServerContext, applicationVON string) (string, error) {
@@ -222,9 +299,7 @@
 	installationID := generateID()
 	installationDir := filepath.Join(i.config.Root, applicationDirName(envelope.Title), installationDirName(installationID))
 	versionDir := filepath.Join(installationDir, generateVersionDirName())
-	perm := os.FileMode(0700)
-	if err := os.MkdirAll(versionDir, perm); err != nil {
-		vlog.Errorf("MkdirAll(%v, %v) failed: %v", versionDir, perm, err)
+	if err := mkdir(versionDir); err != nil {
 		return "", errOperationFailed
 	}
 	deferrer := func() {
@@ -266,11 +341,6 @@
 	return nil
 }
 
-func (*appInvoker) Resume(ipc.ServerContext) error {
-	// TODO(jsimsa): Implement.
-	return nil
-}
-
 func (*appInvoker) Revert(ipc.ServerContext) error {
 	// TODO(jsimsa): Implement.
 	return nil
@@ -285,20 +355,17 @@
 	// the app (to point to the device mounttable).
 	cmd.Env = envelope.Env
 	rootDir := filepath.Join(instanceDir, "root")
-	perm := os.FileMode(0700)
-	if err := os.MkdirAll(rootDir, perm); err != nil {
-		vlog.Errorf("MkdirAll(%v, %v) failed: %v", rootDir, perm, err)
+	if err := mkdir(rootDir); err != nil {
 		return nil, err
 	}
 	cmd.Dir = rootDir
 	logDir := filepath.Join(instanceDir, "logs")
-	if err := os.MkdirAll(logDir, perm); err != nil {
-		vlog.Errorf("MkdirAll(%v, %v) failed: %v", logDir, perm, err)
+	if err := mkdir(logDir); err != nil {
 		return nil, err
 	}
 	timestamp := time.Now().UnixNano()
 	var err error
-	perm = os.FileMode(0600)
+	perm := os.FileMode(0600)
 	cmd.Stdout, err = os.OpenFile(filepath.Join(logDir, fmt.Sprintf("STDOUT-%d", timestamp)), os.O_WRONLY|os.O_CREATE, perm)
 	if err != nil {
 		return nil, err
@@ -335,29 +402,60 @@
 	return installationDir, nil
 }
 
-func (i *appInvoker) Start(ipc.ServerContext) ([]string, error) {
+// newInstance sets up the directory for a new application instance.
+func (i *appInvoker) newInstance() (string, string, error) {
 	installationDir, err := i.installationDir()
 	if err != nil {
-		return nil, err
+		return "", "", err
+	}
+	instanceID := generateID()
+	instanceDir := filepath.Join(installationDir, "instances", instanceDirName(instanceID))
+	if mkdir(instanceDir) != nil {
+		return "", instanceID, errOperationFailed
 	}
 	currLink := filepath.Join(installationDir, "current")
-	envelope, err := loadEnvelope(currLink)
+	versionDir, err := filepath.EvalSymlinks(currLink)
+	if err != nil {
+		vlog.Errorf("EvalSymlinks(%v) failed: %v", currLink, err)
+		return instanceDir, instanceID, err
+	}
+	versionLink := filepath.Join(instanceDir, "version")
+	if err := os.Symlink(versionDir, versionLink); err != nil {
+		vlog.Errorf("Symlink(%v, %v) failed: %v", versionDir, versionLink, err)
+		return instanceDir, instanceID, errOperationFailed
+	}
+	if err := initializeState(instanceDir, suspended); err != nil {
+		return instanceDir, instanceID, err
+	}
+	return instanceDir, instanceID, nil
+}
+
+func genCmd(instanceDir string) (*exec.Cmd, error) {
+	versionLink := filepath.Join(instanceDir, "version")
+	versionDir, err := filepath.EvalSymlinks(versionLink)
+	if err != nil {
+		vlog.Errorf("EvalSymlinks(%v) failed: %v", versionLink, err)
+		return nil, errOperationFailed
+	}
+	envelope, err := loadEnvelope(versionDir)
 	if err != nil {
 		return nil, err
 	}
-	binPath := filepath.Join(currLink, "bin")
+	binPath := filepath.Join(versionDir, "bin")
 	if _, err := os.Stat(binPath); err != nil {
 		vlog.Errorf("Stat(%v) failed: %v", binPath, err)
 		return nil, errOperationFailed
 	}
-	instanceID := generateID()
-	// TODO(caprita): Clean up instanceDir upon failure.
-	instanceDir := filepath.Join(installationDir, "instances", instanceDirName(instanceID))
+	// TODO(caprita): Fold generateCommand inline here.
 	cmd, err := generateCommand(envelope, binPath, instanceDir)
 	if err != nil {
 		vlog.Errorf("generateCommand(%v, %v, %v) failed: %v", envelope, binPath, instanceDir, err)
 		return nil, errOperationFailed
 	}
+	return cmd, nil
+}
+
+func (i *appInvoker) startCmd(instanceDir string, cmd *exec.Cmd) error {
 	// Setup up the child process callback.
 	callbackState := i.callback
 	listener := callbackState.listenFor(mgmt.AppCycleManagerConfigKey)
@@ -365,55 +463,92 @@
 	cfg := config.New()
 	cfg.Set(mgmt.ParentNodeManagerConfigKey, listener.name())
 	handle := vexec.NewParentHandle(cmd, vexec.ConfigOpt{cfg})
+	defer func() {
+		if handle != nil {
+			if err := handle.Clean(); err != nil {
+				vlog.Errorf("Clean() failed: %v", err)
+			}
+		}
+	}()
 	// Start the child process.
 	if err := handle.Start(); err != nil {
 		vlog.Errorf("Start() failed: %v", err)
-		return nil, errOperationFailed
+		return errOperationFailed
 	}
 	// Wait for the child process to start.
 	timeout := 10 * time.Second
 	if err := handle.WaitForReady(timeout); err != nil {
 		vlog.Errorf("WaitForReady(%v) failed: %v", timeout, err)
-		if err := handle.Clean(); err != nil {
-			vlog.Errorf("Clean() failed: %v", err)
-		}
-		return nil, errOperationFailed
+		return errOperationFailed
 	}
 	childName, err := listener.waitForValue(timeout)
 	if err != nil {
-		if err := handle.Clean(); err != nil {
-			vlog.Errorf("Clean() failed: %v", err)
-		}
-		return nil, errOperationFailed
+		return errOperationFailed
 	}
 	instanceInfo := &instanceInfo{
 		AppCycleMgrName: childName,
 		Pid:             handle.Pid(),
 	}
 	if err := saveInstanceInfo(instanceDir, instanceInfo); err != nil {
-		if err := handle.Clean(); err != nil {
-			vlog.Errorf("Clean() failed: %v", err)
-		}
-		return nil, err
+		return err
 	}
 	// TODO(caprita): Spin up a goroutine to reap child status upon exit and
 	// transition it to suspended state if it exits on its own.
+	handle = nil
+	return nil
+}
+
+func (i *appInvoker) run(instanceDir string) error {
+	if err := transition(instanceDir, suspended, starting); err != nil {
+		return err
+	}
+	cmd, err := genCmd(instanceDir)
+	if err == nil {
+		err = i.startCmd(instanceDir, cmd)
+	}
+	if err != nil {
+		transition(instanceDir, starting, suspended)
+		return err
+	}
+	return transition(instanceDir, starting, started)
+}
+
+func (i *appInvoker) Start(ipc.ServerContext) ([]string, error) {
+	instanceDir, instanceID, err := i.newInstance()
+	if err == nil {
+		err = i.run(instanceDir)
+	}
+	if err != nil {
+		if instanceDir != "" {
+			if err := os.RemoveAll(instanceDir); err != nil {
+				vlog.Errorf("RemoveAll(%v) failed: %v", instanceDir, err)
+			}
+		}
+		return nil, err
+	}
 	return []string{instanceID}, nil
 }
 
 // instanceDir returns the path to the directory containing the app instance
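-// referred to by the invoker's suffix, as well as the corresponding stopped
-// instance dir.  Returns an error if the suffix does not name an instance.
+// referred to by the invoker's suffix.  Returns an error if the suffix does
+// not name an instance.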
-func (i *appInvoker) instanceDir() (string, string, error) {
+func (i *appInvoker) instanceDir() (string, error) {
 	components := i.suffix
 	if nComponents := len(components); nComponents != 3 {
-		return "", "", errInvalidSuffix
+		return "", errInvalidSuffix
 	}
 	app, installation, instance := components[0], components[1], components[2]
 	instancesDir := filepath.Join(i.config.Root, applicationDirName(app), installationDirName(installation), "instances")
 	instanceDir := filepath.Join(instancesDir, instanceDirName(instance))
-	stoppedInstanceDir := filepath.Join(instancesDir, stoppedInstanceDirName(instance))
-	return instanceDir, stoppedInstanceDir, nil
+	return instanceDir, nil
+}
+
+func (i *appInvoker) Resume(ipc.ServerContext) error {
+	instanceDir, err := i.instanceDir()
+	if err != nil {
+		return err
+	}
+	return i.run(instanceDir)
 }
 
 func stopAppRemotely(appVON string) error {
@@ -444,32 +579,47 @@
 	return nil
 }
 
-func (i *appInvoker) Stop(_ ipc.ServerContext, deadline uint32) error {
-	// TODO(caprita): implement deadline.
-	instanceDir, stoppedInstanceDir, err := i.instanceDir()
+func stop(instanceDir string) error {
+	info, err := loadInstanceInfo(instanceDir)
 	if err != nil {
 		return err
 	}
-	if err := os.Rename(instanceDir, stoppedInstanceDir); err != nil {
-		vlog.Errorf("Rename(%v, %v) failed: %v", instanceDir, stoppedInstanceDir, err)
-		if os.IsNotExist(err) {
-			return errNotExist
-		}
-		vlog.Errorf("Rename(%v, %v) failed: %v", instanceDir, stoppedInstanceDir, err)
-		return errOperationFailed
-	}
-	// TODO(caprita): restore the instance to unstopped upon failure?
-
-	info, err := loadInstanceInfo(stoppedInstanceDir)
-	if err != nil {
-		return errOperationFailed
-	}
 	return stopAppRemotely(info.AppCycleMgrName)
 }
 
-func (*appInvoker) Suspend(ipc.ServerContext) error {
-	// TODO(jsimsa): Implement.
-	return nil
+// TODO(caprita): implement deadline for Stop.
+
+func (i *appInvoker) Stop(_ ipc.ServerContext, deadline uint32) error {
+	instanceDir, err := i.instanceDir()
+	if err != nil {
+		return err
+	}
+	if err := transition(instanceDir, suspended, stopped); err == errOperationFailed || err == nil {
+		return err
+	}
+	if err := transition(instanceDir, started, stopping); err != nil {
+		return err
+	}
+	if err := stop(instanceDir); err != nil {
+		transition(instanceDir, stopping, started)
+		return err
+	}
+	return transition(instanceDir, stopping, stopped)
+}
+
+func (i *appInvoker) Suspend(ipc.ServerContext) error {
+	instanceDir, err := i.instanceDir()
+	if err != nil {
+		return err
+	}
+	if err := transition(instanceDir, started, suspending); err != nil {
+		return err
+	}
+	if err := stop(instanceDir); err != nil {
+		transition(instanceDir, suspending, started)
+		return err
+	}
+	return transition(instanceDir, suspending, suspended)
 }
 
 func (*appInvoker) Uninstall(ipc.ServerContext) error {
diff --git a/services/mgmt/node/impl/impl_test.go b/services/mgmt/node/impl/impl_test.go
index a7aa441..6d217b2 100644
--- a/services/mgmt/node/impl/impl_test.go
+++ b/services/mgmt/node/impl/impl_test.go
@@ -480,6 +480,32 @@
 	}
 }
 
+func suspendApp(t *testing.T, appID, instanceID string) {
+	appsName := "nm//apps"
+	appName := naming.Join(appsName, appID)
+	instanceName := naming.Join(appName, instanceID)
+	stub, err := node.BindApplication(instanceName)
+	if err != nil {
+		t.Fatalf("BindApplication(%v) failed: %v", instanceName, err)
+	}
+	if err := stub.Suspend(rt.R().NewContext()); err != nil {
+		t.Fatalf("Suspend failed: %v", err)
+	}
+}
+
+func resumeApp(t *testing.T, appID, instanceID string) {
+	appsName := "nm//apps"
+	appName := naming.Join(appsName, appID)
+	instanceName := naming.Join(appName, instanceID)
+	stub, err := node.BindApplication(instanceName)
+	if err != nil {
+		t.Fatalf("BindApplication(%v) failed: %v", instanceName, err)
+	}
+	if err := stub.Resume(rt.R().NewContext()); err != nil {
+		t.Fatalf("Resume failed: %v", err)
+	}
+}
+
 func verifyAppWorkspace(t *testing.T, root, appID, instanceID string) {
 	// HACK ALERT: for now, we peek inside the node manager's directory
 	// structure (which ought to be opaque) to check for what the app has
@@ -495,7 +521,7 @@
 	}
 	components := strings.Split(appID, "/")
 	appTitle, installationID := components[0], components[1]
-	instanceDir := filepath.Join(root, applicationDirName(appTitle), "installation-"+installationID, "instances", "stopped-instance-"+instanceID)
+	instanceDir := filepath.Join(root, applicationDirName(appTitle), "installation-"+installationID, "instances", "instance-"+instanceID)
 	rootDir := filepath.Join(instanceDir, "root")
 	testFile := filepath.Join(rootDir, "testfile")
 	if read, err := ioutil.ReadFile(testFile); err != nil {
@@ -549,6 +575,13 @@
 	instanceID := startApp(t, appID)
 	<-pingCh // Wait until the app pings us that it's ready.
 
+	// Suspend the app.
+	suspendApp(t, appID, instanceID)
+	<-pingCh // App should have pinged us before it terminated.
+
+	resumeApp(t, appID, instanceID)
+	<-pingCh
+
 	// TODO(caprita): test Suspend and Resume, and verify various
 	// non-standard combinations (suspend when stopped; resume while still
 	// running; stop while suspended).