lib/raft/raft.go - release.go.x.ref - Git at Google

 // Copyright 2015 The Vanadium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package raft

 // This package implements the Raft protocol, https://ramcloud.stanford.edu/raft.pdf. The
 // logged commands are strings.   If someone wishes a more complex command structure, they
 // should use an encoding (e.g. json) into the strings.

 import (
 	"io"
 	"math/rand"
 	"sort"
 	"sync"
 	"time"

 	"v.io/x/lib/vlog"

 	"v.io/v23"
 	"v.io/v23/context"
 	"v.io/v23/naming"
 	"v.io/v23/options"
 	"v.io/v23/verror"
 )

 const pkgPath = "v.io/x/ref/lib.raft"

 var (
 	errBadAppend     = verror.Register(pkgPath+".errBadAppend", verror.NoRetry, "{1:}{2:} inconsistent append{:_}")
 	errAddAfterStart = verror.Register(pkgPath+".errAddAfterStart", verror.NoRetry, "{1:}{2:} adding member after start{:_}")
 	errNotLeader     = verror.Register(pkgPath+".errNotLeader", verror.NoRetry, "{1:}{2:} not the leader{:_}")
 	errWTF           = verror.Register(pkgPath+".errWTF", verror.NoRetry, "{1:}{2:} internal error{:_}")
 	errTimedOut      = verror.Register(pkgPath+".errTimedOut", verror.NoRetry, "{1:}{2:} request timed out{:_}")
 	errBadTerm       = verror.Register(pkgPath+".errBadTerm", verror.NoRetry, "{1:}{2:} new term {3} < {4} {:_}")
 )

 // member keeps track of another member's state.
 type member struct {
 	id         string
 	nextIndex  Index         // Next log index to send to this follower.
 	matchIndex Index         // Last entry logged by this follower.
 	stopped    chan struct{} // Follower go routine closes this to indicate it has terminated.
 	update     chan struct{}
 	timer      *time.Timer
 }

 // memberSlice is used for sorting members by highest logged (matched) entry.
 type memberSlice []*member

 func (m memberSlice) Len() int           { return len(m) }
 func (m memberSlice) Less(i, j int) bool { return m[i].matchIndex > m[j].matchIndex }
 func (m memberSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }

 // raft is the implementation of the raft library.
 type raft struct {
 	sync.Mutex

 	ctx       *context.T
 	cancel    context.CancelFunc
 	rng       *rand.Rand
 	timer     *time.Timer
 	heartbeat time.Duration

 	// rpc interface between instances.
 	s service

 	// Client interface.
 	client RaftClient

 	// applied is the highest log entry applied to the client.
 	applied struct {
 		index Index
 		term  Term
 	}

 	// Raft algorithm volatile state.
 	role        int
 	leader      string
 	quorum      int                // Number of members that form a quorum.
 	commitIndex Index              // Highest index commited.
 	memberMap   map[string]*member // Map of raft members (including current).
 	memberSet   memberSlice        // Slice of raft members (including current).
 	me          *member

 	// Raft algorithm persistent state
 	p      persistent
 	logDir string

 	// stop and stopped are for clean shutdown.  All long lived go routines (perFollower and serverEvents)
 	// exit when stop is closed.  Each perFollower then closes member.stopped and serverEvents closes
 	// stopped to signal that they are finished.
 	stop    chan struct{} // perFollower and serverEvents go routines exit when this is closed.
 	stopped chan struct{} // serverEvents go routine closes this to indicate it has terminated.

 	// Each time a perFollower successfully logs entries on a follower it writes to newMatch to get the
 	// serverEvents routine to possibly update the commitIndex.
 	newMatch chan struct{} // Each time a follower reports a logged entry a message is sent on this.

 	// Each time a follower receives a new commit index, it sends it to newCommit to get the serverEvents
 	// routine to apply any newly committed entries.
 	newCommit chan Index // Each received leader commit index is written to this.

 	// Wait here for commitIndex to change.
 	ccv *sync.Cond

 	// Wait here for leadership to change.
 	lcv *sync.Cond

 	// Variables for the sync loop.
 	sync struct {
 		sync.Mutex
 		requested   uint64 // Incremented each sync request.
 		requestedcv *sync.Cond
 		done        uint64 // Updated to last request prior to the current sync.
 		donecv      *sync.Cond
 		stopped     chan struct{}
 	}
 }

 // logentry is the in memory structure for each logged item.  It is
 type logEntry struct {
 	Term       Term
 	Index      Index
 	Cmd        []byte
 	Type       byte
 	ApplyError error
 }

 // newRaft creates a new raft server.
 //  logDir        - the name of the directory in which to persist the log.
 //  serverName    - a name for the server to announce itself as in a mount table.  All members should use the
 //                 same name and hence be alternatives for callers.
 //  hostPort      - the network address of the server
 //  hb            - the interval between heartbeats.  0 means use default.
 //  snapshotThreshold - the size the log can reach before we create a snapshot.  0 means use default.
 //  client        - callbacks to the client.
 func newRaft(ctx *context.T, config *RaftConfig, client RaftClient) (*raft, error) {
 	nctx, cancel := context.WithCancel(ctx)
 	r := &raft{}
 	r.ctx = nctx
 	r.cancel = cancel
 	r.rng = rand.New(rand.NewSource(time.Now().UnixNano()))
 	r.heartbeat = config.Heartbeat
 	if r.heartbeat == 0 {
 		r.heartbeat = 3 * time.Second
 	}

 	// Client interface.
 	r.client = client
 	r.applied.term = 0
 	r.applied.index = 0

 	// Raft volatile state.
 	r.role = RoleStopped
 	r.commitIndex = 0
 	r.leader = ""
 	r.memberMap = make(map[string]*member)
 	r.memberSet = make([]*member, 0)
 	r.AddMember(ctx, config.HostPort)
 	r.me = r.memberMap[config.HostPort]

 	// Raft persistent state.
 	var err error
 	r.logDir = config.LogDir
 	if r.p, err = openPersist(ctx, r, config.SnapshotThreshold); err != nil {
 		return nil, err
 	}

 	// Internal communication/synchronization.
 	r.newMatch = make(chan struct{}, 100)
 	r.newCommit = make(chan Index, 100)
 	r.ccv = sync.NewCond(r)
 	r.lcv = sync.NewCond(r)
 	r.sync.donecv = sync.NewCond(&r.sync)
 	r.sync.requestedcv = sync.NewCond(&r.sync)

 	// The RPC interface to other members.
 	eps, err := r.s.newService(nctx, r, config.ServerName, config.HostPort, config.Acl)
 	if err != nil {
 		return nil, err
 	}

 	// If we're in the V namespace, just use the name as our Id.  If not create one
 	// from the network address.
 	r.me.id = config.ServerName
 	if r.me.id == "" {
 		r.me.id = string(getShortName(eps[0]))
 	}

 	return r, nil
 }

 // getShortName will return a /host:port name if possible.  Otherwise it will just return the name
 // version of the endpoint.
 func getShortName(ep naming.Endpoint) string {
 	if ep.Addr().Network() != "tcp" {
 		return ep.Name()
 	}
 	return naming.JoinAddressName(ep.Addr().String(), "")
 }

 // AddMember adds the id as a raft member.  The id must be a vanadium name.
 func (r *raft) AddMember(ctx *context.T, id string) error {
 	if r.role != RoleStopped {
 		// Already started.
 		// TODO(p): Raft has a protocol for changing membership after
 		// start.  I'll add that after I get at least one client
 		// working.
 		return verror.New(errAddAfterStart, ctx)
 	}
 	m := &member{id: id, stopped: make(chan struct{}), update: make(chan struct{}, 10)}
 	r.memberMap[id] = m
 	r.memberSet = append(r.memberSet, m)
 	// Quorum has to be more than half the servers.
 	r.quorum = (len(r.memberSet) + 1) / 2
 	return nil
 }

 // Id returns the vanadium name of this server.
 func (r *raft) Id() string {
 	return r.me.id
 }

 // Start gets the protocol going.
 func (r *raft) Start() {
 	vlog.Infof("@%s starting", r.me.id)
 	r.Lock()
 	defer r.Unlock()
 	if r.role != RoleStopped {
 		// already started
 		return
 	}
 	r.timer = time.NewTimer(2 * r.heartbeat)

 	// serverEvents serializes events for this server.
 	r.stop = make(chan struct{})
 	r.stopped = make(chan struct{})
 	go r.serverEvents()

 	// syncLoop syncs with the leader when needed.
 	r.sync.stopped = make(chan struct{})
 	go r.syncLoop()

 	// perFollowers updates the followers when we're the leader.
 	for _, m := range r.memberSet {
 		if m.id != r.me.id {
 			go r.perFollower(m)
 		}
 	}
 	return
 }

 // Stop ceases all function as a raft server.
 func (r *raft) Stop() {
 	vlog.Infof("@%s stopping", r.me.id)
 	r.Lock()
 	if r.role == RoleStopped {
 		r.Unlock()
 		r.cancel() // in case *r never got out of RoleStopped
 		return
 	}
 	r.role = RoleStopped
 	r.Unlock()
 	r.cancel()

 	// Stop the associated go routines.
 	close(r.stop)

 	// Wait for serverEvents to stop.
 	<-r.stopped

 	// Wait for syncLoop to stop.
 	r.sync.donecv.Broadcast()
 	r.sync.requestedcv.Broadcast()
 	<-r.sync.stopped

 	// Wait for all the perFollower routines to stop.
 	for _, m := range r.memberSet {
 		if m.id != r.me.id {
 			<-m.stopped
 		}
 	}

 	// Shut down the log file.
 	r.p.Close()

 	vlog.Infof("@%s stopping service", r.me.id)
 	<-r.s.server.Closed()
 	vlog.Infof("@%s stopped", r.me.id)
 }

 // setRoleAndWatchdogTimer called with r.l locked.
 func (r *raft) setRoleAndWatchdogTimer(role int) {
 	vlog.VI(2).Infof("@%s %s->%s", r.me.id, roleToString(r.role), roleToString(role))
 	r.role = role
 	switch role {
 	case RoleFollower:
 		// Wake up any RaftProto.Append()s waiting for a commitment.  They
 		// will now have to give up since we are no longer leader.
 		r.ccv.Broadcast()
 		// Set a timer to start an election if we no longer hear from the leader.
 		r.resetTimerFuzzy(2 * r.heartbeat)
 	case RoleLeader:
 		r.leader = r.me.id

 		// Set known follower status to default values.
 		for _, m := range r.memberSet {
 			if m.id != r.me.id {
 				m.nextIndex = r.p.LastIndex() + 1
 				m.matchIndex = 0
 			}
 		}

 		// Set my match index to the last one logged.
 		r.setMatchIndex(r.me, r.p.LastIndex())

 		// Let waiters know a new leader exists.
 		r.lcv.Broadcast()
 	case RoleCandidate:
 		// If this goes off, we lost an election and need to start a new one.
 		// We make it longer than the follower timeout because we make have
 		// lost due to safety so give someone else a chance.
 		r.resetTimerFuzzy(4 * r.heartbeat)
 	}
 }

 // setRole called with r.l locked.
 func (r *raft) setRole(role int) {
 	vlog.VI(2).Infof("@%s %s->%s", r.me.id, roleToString(r.role), roleToString(role))
 	r.role = role
 }

 func (r *raft) appendNull() {
 	// Assign an index and term to the log entry.
 	le := LogEntry{Term: r.p.CurrentTerm(), Index: r.p.LastIndex() + 1, Cmd: nil, Type: RaftEntry}

 	// Append to our own log.
 	if err := r.p.AppendToLog(r.ctx, r.p.LastTerm(), r.p.LastIndex(), []LogEntry{le}); err != nil {
 		// This shouldn't happen.
 		return
 	}

 	// Update the fact that we've logged it.
 	r.setMatchIndex(r.me, le.Index)
 	r.kickFollowers()
 }

 // Status returns the current member's id, its raft role, and who it thinks is leader.
 func (r *raft) Status() (string, int, string) {
 	r.Lock()
 	defer r.Unlock()
 	return r.me.id, r.role, r.leader
 }

 // StartElection starts a new round of voting.  We do this by incrementing the
 // Term and, in parallel, calling each other member to vote.  If we receive a
 // majority we win and send a heartbeat to each member.
 //
 // Someone else many get elected in the middle of the vote so we have to
 // make sure we're still a candidate at the end of the voting.
 func (r *raft) StartElection() {
 	r.Lock()
 	defer r.Unlock()
 	r.startElection()
 }

 func (r *raft) startElection() {
 	// If we can't get a response in 2 seconds, something is really wrong.
 	ctx, cancel := context.WithTimeout(r.ctx, 2*time.Second)
 	defer cancel()
 	if err := r.p.IncCurrentTerm(); err != nil {
 		// If this fails, there's no way to recover.
 		vlog.Fatalf("incrementing current term: %s", err)
 		return
 	}
 	vlog.Infof("@%s startElection new term %d", r.me.id, r.p.CurrentTerm())

 	msg := []interface{}{
 		r.p.CurrentTerm(),
 		r.me.id,
 		r.p.LastTerm(),
 		r.p.LastIndex(),
 	}
 	var members []string
 	for k, m := range r.memberMap {
 		if m.id == r.me.id {
 			continue
 		}
 		members = append(members, k)
 	}
 	r.setRoleAndWatchdogTimer(RoleCandidate)
 	r.p.SetVotedFor(r.me.id)
 	r.leader = ""
 	r.Unlock()

 	// We have to do this outside the lock or the system will deadlock when two members start overlapping votes.
 	type reply struct {
 		term Term
 		ok   bool
 	}
 	c := make(chan reply)
 	for _, id := range members {
 		go func(id string) {
 			var rep reply
 			client := v23.GetClient(ctx)
 			if err := client.Call(ctx, id, "RequestVote", msg, []interface{}{&rep.term, &rep.ok}, options.Preresolved{}); err != nil {
 				vlog.Infof("@%s sending RequestVote to %s: %s", r.me.id, id, err)
 			}
 			c <- rep
 		}(id)
 	}

 	// Wait till all the voters have voted or timed out.
 	oks := 1 // We vote for ourselves.
 	highest := Term(0)
 	for range members {
 		rep := <-c
 		if rep.ok {
 			oks++
 		}
 		if rep.term > highest {
 			highest = rep.term
 		}
 	}

 	r.Lock()
 	// We have to check the role since someone else may have become the leader during the round and
 	// made us a follower.
 	if oks <= len(members)/2 || r.role != RoleCandidate {
 		if highest > r.p.CurrentTerm() {
 			// If someone answered with a higher term, stop being a candidate.
 			r.setRoleAndWatchdogTimer(RoleFollower)
 			r.p.SetCurrentTerm(highest)
 		}
 		vlog.VI(2).Infof("@%s lost election with %d votes", r.me.id, oks)
 		return
 	}
 	vlog.Infof("@%s won election with %d votes", r.me.id, oks)
 	r.setRoleAndWatchdogTimer(RoleLeader)

 	// Tell followers we are now the leader.
 	r.appendNull()

 }

 // applyCommits applies any committed entries.
 func (r *raft) applyCommits(commitIndex Index) {
 	for r.applied.index < commitIndex {
 		// This is the only go routine that changes r.applied
 		// so we don't have to protect our reads.
 		next := r.applied.index + 1
 		le := r.p.Lookup(next)
 		if le == nil {
 			// Commit index is ahead of our highest entry.
 			return
 		}
 		switch le.Type {
 		case ClientEntry:
 			le.ApplyError = r.client.Apply(le.Cmd, le.Index)
 		case RaftEntry:
 		}

 		// But we do have to lock our writes.
 		r.Lock()
 		r.applied.index = next
 		r.applied.term = le.Term
 		r.Unlock()
 	}

 	r.p.ConsiderSnapshot(r.ctx, r.applied.term, r.applied.index)
 }

 func (r *raft) lastApplied() Index {
 	r.Lock()
 	defer r.Unlock()
 	return r.applied.index
 }

 func (r *raft) resetTimerFuzzy(d time.Duration) {
 	fuzz := time.Duration(rand.Int63n(int64(r.heartbeat)))
 	r.timer.Reset(d + fuzz)
 }

 func (r *raft) resetTimer(d time.Duration) {
 	r.timer.Reset(d)
 }

 func highestFromChan(i Index, c chan Index) Index {
 	for {
 		select {
 		case j := <-c:
 			if j > i {
 				i = j
 			}
 		default:
 			return i
 		}
 	}
 }

 // serverEvents is a go routine that serializes server events.  This loop performs:
 // (1) all changes to commitIndex both as a leader and a follower.
 // (2) all application of committed log commands.
 // (3) all elections.
 func (r *raft) serverEvents() {
 	r.Lock()
 	r.setRoleAndWatchdogTimer(RoleFollower)
 	r.Unlock()
 	for {
 		select {
 		case <-r.stop:
 			// Terminate.
 			close(r.stopped)
 			return
 		case <-r.timer.C:
 			// Start an election whenever either:
 			// (1) a follower hasn't heard from the leader in a random interval > 2 * heartbeat.
 			// (2) a candidate hasn't won an election or been told anyone else has after hearbeat.
 			r.Lock()
 			switch r.role {
 			case RoleCandidate:
 				r.startElection()
 			case RoleFollower:
 				r.startElection()
 			}
 			r.Unlock()
 		case <-r.newMatch:
 			// Soak up any queued requests.
 			emptyChan(r.newMatch)

 			// This happens whenever we have gotten a reply from a follower.  We do it
 			// here rather than in perFollower solely as a matter of taste.
 			// Update the commitIndex if needed and apply any newly committed entries.
 			r.Lock()
 			if r.role != RoleLeader {
 				r.Unlock()
 				continue
 			}
 			sort.Sort(r.memberSet)
 			ci := r.memberSet[r.quorum-1].matchIndex
 			if ci <= r.commitIndex {
 				r.Unlock()
 				continue
 			}
 			r.commitIndex = ci
 			r.Unlock()
 			r.applyCommits(ci)
 			r.ccv.Broadcast()
 			r.kickFollowers()
 		case i := <-r.newCommit:
 			// Get highest queued up commit.
 			i = highestFromChan(i, r.newCommit)

 			// Update the commitIndex if needed and apply any newly committed entries.
 			r.Lock()
 			if r.role != RoleFollower {
 				r.Unlock()
 				continue
 			}
 			if i > r.commitIndex {
 				r.commitIndex = i
 			}
 			ci := r.commitIndex
 			r.Unlock()
 			r.applyCommits(ci)
 			r.ccv.Broadcast()
 		}
 	}
 }

 // makeAppendMsg creates an append message at most 10 entries long.
 func (r *raft) makeAppendMsg(m *member) ([]interface{}, int) {
 	// Figure out if we know the previous entry.
 	prevTerm, prevIndex, ok := r.p.LookupPrevious(m.nextIndex)
 	if !ok {
 		return nil, 0
 	}
 	// Collect some log entries to send along.  0 is ok.
 	var entries []LogEntry
 	for i := 0; i < 10; i++ {
 		le := r.p.Lookup(m.nextIndex + Index(i))
 		if le == nil {
 			break
 		}
 		entries = append(entries, LogEntry{Cmd: le.Cmd, Term: le.Term, Index: le.Index, Type: le.Type})
 	}
 	return []interface{}{
 		r.p.CurrentTerm(),
 		r.me.id,
 		prevIndex,
 		prevTerm,
 		r.commitIndex,
 		entries,
 	}, len(entries)
 }

 // updateFollower loops trying to update a follower until the follower is updated or we can't proceed.
 // It will always send at least one update so will also act as a heartbeat.
 func (r *raft) updateFollower(m *member) {
 	// Bring this server up to date.
 	r.Lock()
 	defer r.Unlock()
 	for {
 		// If we're not the leader we have no followers.
 		if r.role != RoleLeader {
 			return
 		}

 		// Collect some log entries starting at m.nextIndex.
 		msg, n := r.makeAppendMsg(m)
 		if msg == nil {
 			// Try sending a snapshot.
 			r.Unlock()
 			vlog.Infof("@%s sending snapshot to %s", r.me.id, m.id)
 			snapIndex, err := r.sendLatestSnapshot(m)
 			r.Lock()
 			if err != nil {
 				// Try again later.
 				vlog.Errorf("@%s sending snapshot to %s: %s", r.me.id, m.id, err)
 				return
 			}
 			m.nextIndex = snapIndex + 1
 			vlog.Infof("@%s sent snapshot to %s", r.me.id, m.id)
 			// Try sending anything following the snapshot.
 			continue
 		}

 		// Send to the follower. We drop the lock while we do this. That means we may stop being the
 		// leader in the middle of the call but that's OK as long as we check when we get it back.
 		r.Unlock()
 		ctx, cancel := context.WithTimeout(r.ctx, time.Duration(2)*time.Second)
 		client := v23.GetClient(ctx)
 		err := client.Call(ctx, m.id, "AppendToLog", msg, []interface{}{}, options.Preresolved{})
 		cancel()
 		r.Lock()
 		if r.role != RoleLeader {
 			// Not leader any more, doesn't matter how he replied.
 			return
 		}

 		if err != nil {
 			if verror.ErrorID(err) != errOutOfSequence.ID {
 				// A problem other than missing entries.  Retry later.
 				//vlog.Errorf("@%s updating %s: %s", r.me.id, m.id, err)
 				vlog.Errorf("@%s updating %s: %s", r.me.id, m.id, err)
 				return
 			}
 			// At this point we know that the follower is missing entries pervious to what
 			// we just sent.  If we can backup, do it.  Otherwise try sending a snapshot.
 			if m.nextIndex <= 1 {
 				return
 			}
 			prev := r.p.Lookup(m.nextIndex - 1)
 			if prev == nil {
 				return
 			}
 			// We can back up.
 			m.nextIndex = m.nextIndex - 1
 			continue
 		}

 		// The follower appended correctly, update indices and tell the server thread that
 		// the commit index may need to change.
 		m.nextIndex += Index(n)
 		logged := m.nextIndex - 1
 		if n > 0 {
 			r.setMatchIndex(m, logged)
 		}

 		// The follower is caught up?
 		if m.nextIndex > r.p.LastIndex() {
 			return
 		}
 	}
 }

 func (r *raft) sendLatestSnapshot(m *member) (Index, error) {
 	rd, term, index, err := r.p.OpenLatestSnapshot(r.ctx)
 	if err != nil {
 		return 0, err
 	}
 	ctx, cancel := context.WithTimeout(r.ctx, time.Duration(5*60)*time.Second)
 	defer cancel()
 	client := raftProtoClient(m.id)
 	call, err := client.InstallSnapshot(ctx, r.p.CurrentTerm(), r.me.id, term, index, options.Preresolved{})
 	if err != nil {
 		return 0, err
 	}
 	sstream := call.SendStream()
 	b := make([]byte, 10240)
 	for {
 		n, err := rd.Read(b)
 		if n == 0 && err == io.EOF {
 			break
 		}
 		if err = sstream.Send(b); err != nil {
 			return 0, err
 		}
 	}
 	if err := call.Finish(); err != nil {
 		return 0, err
 	}
 	return index, nil
 }

 func emptyChan(c chan struct{}) {
 	for {
 		select {
 		case <-c:
 		default:
 			return
 		}
 	}
 }

 // perFollower is a go routine that sequences all messages to a single follower.
 //
 // This is the only go routine that updates the follower's variables so all changes to
 // the member struct are serialized by it.
 func (r *raft) perFollower(m *member) {
 	m.timer = time.NewTimer(r.heartbeat)
 	for {
 		select {
 		case <-m.timer.C:
 			r.updateFollower(m)
 			m.timer.Reset(r.heartbeat)
 		case <-m.update:
 			// Soak up any waiting update requests
 			emptyChan(m.update)
 			r.updateFollower(m)
 			m.timer.Reset(r.heartbeat)
 		case <-r.stop:
 			close(m.stopped)
 			return
 		}
 	}
 }

 // kickFollowers causes each perFollower routine to try to update its followers.
 func (r *raft) kickFollowers() {
 	for _, m := range r.memberMap {
 		select {
 		case m.update <- struct{}{}:
 		default:
 		}
 	}
 }

 // setMatchIndex updates the matchIndex for a member.
 //
 // called with r locked.
 func (r *raft) setMatchIndex(m *member, i Index) {
 	m.matchIndex = i
 	if i <= r.commitIndex {
 		return
 	}
 	// Check if we need to change the commit index.
 	select {
 	case r.newMatch <- struct{}{}:
 	default:
 	}
 }

 func minIndex(indices ...Index) Index {
 	if len(indices) == 0 {
 		return 0
 	}
 	min := indices[0]
 	for _, x := range indices[1:] {
 		if x < min {
 			min = x
 		}
 	}
 	return min
 }

 func roleToString(r int) string {
 	switch r {
 	case RoleCandidate:
 		return "candidate"
 	case RoleLeader:
 		return "leader"
 	case RoleFollower:
 		return "follower"
 	}
 	return "?"
 }

 func (r *raft) waitForApply(ctx *context.T, term Term, index Index) (error, error) {
 	r.Lock()
 	defer r.Unlock()
 	for {
 		if r.applied.index >= index {
 			if term == 0 {
 				// Special case: we don't care about Apply() error or committed term, only that we've reached index.
 				return nil, nil
 			}
 			le := r.p.Lookup(index)
 			if le == nil || le.Term != term {
 				// There was an election and the log entry was lost.
 				return nil, verror.New(errNotLeader, ctx)
 			}
 			return le.ApplyError, nil
 		}

 		// Give up if the caller doesn't want to wait.
 		select {
 		case <-ctx.Done():
 			return nil, verror.New(errTimedOut, ctx)
 		default:
 		}

 		// Wait for an apply to happen.  r will be unlocked during the wait.
 		r.ccv.Wait()
 	}
 }

 // waitForLeadership waits until there is an elected leader.
 func (r *raft) waitForLeadership(ctx *context.T) (string, int, Index, bool) {
 	r.Lock()
 	defer r.Unlock()
 	for len(r.leader) == 0 {
 		// Give up if the caller doesn't want to wait.
 		select {
 		case <-ctx.Done():
 			return "", 0, 0, true
 		default:
 		}
 		r.lcv.Wait()
 	}
 	return r.leader, r.role, r.commitIndex, false
 }

 // Append tells the leader to append to the log.  The first error is the result of the client.Apply.  The second
 // is any error from raft.
 func (r *raft) Append(ctx *context.T, cmd []byte) (error, error) {
 	for {
 		leader, role, _, timedOut := r.waitForLeadership(ctx)
 		if timedOut {
 			return nil, verror.New(errTimedOut, ctx)
 		}
 		switch role {
 		case RoleLeader:
 			term, index, err := r.s.Append(ctx, nil, cmd)
 			if err == nil {
 				// We were the leader and the entry has now been applied.
 				return r.waitForApply(ctx, term, index)
 			}
 			// If the leader can't do it, give up.
 			if verror.ErrorID(err) != errNotLeader.ID {
 				return nil, err
 			}
 		case RoleFollower:
 			client := v23.GetClient(ctx)
 			var index Index
 			var term Term
 			if len(leader) == 0 {
 				break
 			}
 			err := client.Call(ctx, leader, "Append", []interface{}{cmd}, []interface{}{&term, &index}, options.Preresolved{})
 			if err == nil {
 				return r.waitForApply(ctx, term, index)
 			}
 			// If the leader can't do it, give up.
 			if verror.ErrorID(err) != errNotLeader.ID {
 				return nil, err
 			}
 		}

 		// Give up if the caller doesn't want to wait.
 		select {
 		case <-ctx.Done():
 			err := verror.New(errTimedOut, ctx)
 			return nil, err
 		default:
 		}
 	}
 }

 func (r *raft) Leader() (bool, string) {
 	r.Lock()
 	defer r.Unlock()
 	if r.role == RoleLeader {
 		return true, r.leader
 	}
 	return false, r.leader
 }

 // syncWithLeader synchronizes with the leader.  On return we have applied the commit index
 // that existed before the call.
 func (r *raft) syncWithLeader(ctx *context.T) error {
 	for {
 		leader, role, commitIndex, timedOut := r.waitForLeadership(ctx)
 		if timedOut {
 			return verror.New(errTimedOut, ctx)
 		}

 		switch role {
 		case RoleLeader:
 			r.waitForApply(ctx, 0, commitIndex)
 			return nil
 		case RoleFollower:
 			client := v23.GetClient(ctx)
 			var index Index
 			err := client.Call(ctx, leader, "Committed", []interface{}{}, []interface{}{&index}, options.Preresolved{})
 			if err == nil {
 				r.waitForApply(ctx, 0, index)
 				return nil
 			}
 			// If the leader can't do it, give up.
 			if verror.ErrorID(err) != errNotLeader.ID {
 				return err
 			}
 		}

 		// Give up if the caller doesn't want to wait.
 		select {
 		case <-ctx.Done():
 			return verror.New(errTimedOut, ctx)
 		default:
 		}
 	}
 }

 // syncLoop is a go routine that syncs whenever necessary with the leader.
 func (r *raft) syncLoop() {
 	for {
 		// Wait for someone to request syncing.
 		r.sync.Lock()
 		for r.sync.requested <= r.sync.done {
 			select {
 			case <-r.stop:
 				close(r.sync.stopped)
 				r.sync.Unlock()
 				return
 			default:
 			}
 			r.sync.requestedcv.Wait()
 		}
 		requested := r.sync.requested
 		r.sync.Unlock()

 		// Perform the sync outside the lock.
 		if err := r.syncWithLeader(r.ctx); err != nil {
 			continue
 		}

 		// Wake up waiters.
 		r.sync.Lock()
 		r.sync.done = requested
 		r.sync.Unlock()
 		r.sync.donecv.Broadcast()
 	}
 }

 // Sync waits for this member to have applied the current commit indexl.
 func (r *raft) Sync(ctx *context.T) error {
 	r.sync.Lock()
 	defer r.sync.Unlock()
 	r.sync.requested++
 	requested := r.sync.requested
 	r.sync.requestedcv.Broadcast()
 	// Wait for our sync to complete.
 	for requested > r.sync.done {
 		select {
 		case <-r.stop:
 			return verror.New(errTimedOut, ctx)
 		case <-ctx.Done():
 			return verror.New(errTimedOut, ctx)
 		default:
 		}
 		r.sync.donecv.Wait()
 	}
 	return nil
 }