| // Copyright 2015 The Vanadium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package vsync |
| |
| // Syncbase Watcher is a goroutine that listens to local Database updates from |
| // applications and modifies sync metadata (e.g. DAG and local log records). |
| // The coupling between Syncbase storage and sync is loose: the Watcher listens
| // asynchronously so that application operations are unblocked as soon as
| // possible, and the sync metadata updates are offloaded to the Watcher. When
| // an application mutates objects in a Database, additional entries are written
| // to a log queue, persisted in the same Database. This queue is read by the
| // sync Watcher to learn of the changes.
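| //
| // In rough terms, the per-database flow is (a sketch derived from the code
| // below, not an authoritative specification):
| //
| //   app write -> watchable store appends watch log entries transactionally
| //   watcher   -> watchable.ReadBatchFromLog() returns one batch of entries
| //             -> entries are converted and filtered into sync log records
| //             -> store.RunInTransaction() updates the DAG and log records and
| //                advances the resume marker (setResMark)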
| |
| import ( |
| "fmt" |
| "strings" |
| |
| "v.io/v23/context" |
| wire "v.io/v23/services/syncbase" |
| "v.io/v23/services/watch" |
| "v.io/v23/verror" |
| "v.io/x/lib/vlog" |
| "v.io/x/ref/services/syncbase/common" |
| "v.io/x/ref/services/syncbase/server/interfaces" |
| "v.io/x/ref/services/syncbase/store" |
| "v.io/x/ref/services/syncbase/store/watchable" |
| sbwatchable "v.io/x/ref/services/syncbase/watchable" |
| ) |
| |
| var ( |
| // watchPrefixes is an in-memory cache of syncgroup prefixes for each |
| // database. It is filled at startup from persisted syncgroup data |
| // and updated at runtime when syncgroups are joined or left. It is not
| // guarded by a mutex because, after the startup phase (which completes
| // before any sync goroutines are started), only the watcher goroutine
| // uses it. The map keys are the database ids (globally unique).
| watchPrefixes = make(map[wire.Id]sgPrefixes) |
| ) |
| |
| // sgPrefixes maps each syncgroup prefix being synced in a database to the set
| // of syncgroups that cover it.
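| //
| // For illustration (hypothetical prefixes and group ids), an entry in the
| // watchPrefixes cache may look like:
| //
| //   sgPrefixes{
| //     "foo": sgSet{gid1: struct{}{}, gid2: struct{}{}},
| //     "bar": sgSet{gid3: struct{}{}},
| //   }
| //
| // meaning that keys under "foo" are currently synced by two syncgroups and
| // keys under "bar" by one.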
| type sgPrefixes map[string]sgSet |
| |
| // StartStoreWatcher starts a Sync goroutine to watch the database store for |
| // updates and process them into the Sync subsystem. This function is called |
| // when a database is created or reopened during service restart, thus spawning |
| // a Sync watcher for each database. |
| func (sd *syncDatabase) StartStoreWatcher(ctx *context.T) { |
| s := sd.sync.(*syncService) |
| dbId, st := sd.db.Id(), sd.db.St() |
| |
| s.pending.Add(1) |
| ctx, cancel := context.WithCancel(ctx) |
| |
| go func() { |
| s.watchStore(ctx, dbId, st) |
| cancel() |
| s.pending.Done() |
| }() |
| } |
| |
| // watchStore processes updates obtained by watching the store. This is the |
| // sync watcher goroutine that learns about store updates asynchronously by |
| // reading log records that track object mutation histories in each database. |
| // For each batch mutation, the watcher updates the sync DAG and log records. |
| // When an application makes a single non-transactional put, it is represented |
| // as a batch of one log record. Thus the watcher only deals with batches. |
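| //
| // The loop below first drains all pending batches (processDatabase returns
| // true as long as it finds a batch to process) and, only when the database is
| // idle, blocks on the store's update notification channel or on sync
| // shutdown.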
| func (s *syncService) watchStore(ctx *context.T, dbId wire.Id, st *watchable.Store) { |
| vlog.VI(1).Infof("sync: watchStore: DB %v: start watching updates", dbId) |
| |
| updatesChan, cancel := watchable.WatchUpdates(st) |
| defer cancel() |
| |
| moreWork := true |
| for moreWork && !s.isClosed() { |
| if s.processDatabase(ctx, dbId, st) { |
| vlog.VI(2).Infof("sync: watchStore: DB %v: had updates", dbId) |
| } else { |
| vlog.VI(2).Infof("sync: watchStore: DB %v: idle, wait for updates", dbId) |
| select { |
| case _, moreWork = <-updatesChan: |
| |
| case <-s.closed: |
| moreWork = false |
| } |
| } |
| } |
| |
| vlog.VI(1).Infof("sync: watchStore: DB %v: channel closed, stop watching and exit", dbId) |
| } |
| |
| // processDatabase fetches from the given database at most one new batch update |
| // (transaction) and processes it. A batch is stored as a contiguous set of log |
| // records ending with one record having the "continued" flag set to false. The |
| // call returns true if a new batch update was processed. |
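| //
| // For example (illustrative only), a transaction with three puts appears in
| // the watch log as three contiguous entries whose "continued" flags are
| // (true, true, false), while a single non-transactional put appears as one
| // entry with the flag set to false.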
| func (s *syncService) processDatabase(ctx *context.T, dbId wire.Id, st store.Store) bool { |
| s.thLock.Lock() |
| defer s.thLock.Unlock() |
| |
| vlog.VI(2).Infof("sync: processDatabase: begin: %v", dbId) |
| defer vlog.VI(2).Infof("sync: processDatabase: end: %v", dbId) |
| |
| resMark, err := getResMark(ctx, st) |
| if err != nil { |
| if verror.ErrorID(err) != verror.ErrNoExist.ID { |
| vlog.Errorf("sync: processDatabase: %v: cannot get resMark: %v", dbId, err) |
| return false |
| } |
| resMark = watchable.MakeResumeMarker(0) |
| } |
| |
| // Initialize Database sync state if needed. |
| s.initSyncStateInMem(ctx, dbId, "") |
| |
| // Get a batch of watch log entries, if any, after this resume marker. |
| logs, nextResmark, err := watchable.ReadBatchFromLog(st, resMark) |
| if err != nil { |
| // An error here (e.g. scan stream cancelled) is possible if the database is
| // destroyed while the watcher is in the middle of processing it. Hence, we
| // just ignore this database and proceed.
| vlog.Errorf("sync: processDatabase: %v: cannot get watch log batch: %v", dbId, verror.DebugString(err)) |
| return false |
| } else if logs == nil { |
| return false |
| } |
| |
| if err = s.processWatchLogBatch(ctx, dbId, st, logs, nextResmark); err != nil { |
| // TODO(rdaoud): quarantine this database. |
| return false |
| } |
| return true |
| } |
| |
| // processWatchLogBatch parses the given batch of watch log records, updates
| // the watchable syncgroup prefixes, uses these prefixes to filter the batch
| // down to the subset of syncable records, then transactionally applies these
| // updates to the sync metadata (DAG & log records) and advances the watch
| // resume marker.
| func (s *syncService) processWatchLogBatch(ctx *context.T, dbId wire.Id, st store.Store, logs []*watchable.LogEntry, resMark watch.ResumeMarker) error { |
| if len(logs) == 0 { |
| return nil |
| } |
| |
| if processDbStateChangeLogRecord(ctx, s, st, dbId, logs[0], resMark) { |
| // A batch containing a DbStateChangeRequest has no other records, so
| // this batch is fully processed.
| return nil |
| } |
| |
| // If the batch contains a syncgroup prefix operation, then this is a
| // syncgroup snapshot and not an application batch. In this case, handle
| // the syncgroup prefix changes by updating the watch prefixes; the
| // syncgroup op entries themselves are dropped during the log conversion
| // below. Also inform the batch processing below to not assign the batch
| // a batch ID in the DAG.
| sgop, err := processSyncgroupLogRecord(dbId, logs) |
| if err != nil { |
| vlog.Errorf("sync: processWatchLogBatch: %v: bad log entry: %v", dbId, err) |
| return err |
| } |
| |
| appBatch := sgop == nil
| |
| // Preprocess the log entries before calling RunInTransaction(): |
| // - Filter out log entries made by sync (echo suppression). |
| // - Convert the log entries to initial sync log records that contain |
| // the metadata from the log entries. These records will be later |
| // augmented with DAG information inside RunInTransaction(). |
| // - Determine which entries are syncable (included in some syncgroup) |
| // and ignore the private ones (not currently shared in a syncgroup). |
| // - Extract blob refs from syncable values and update blob metadata. |
| // |
| // These steps are idempotent. If Syncbase crashes before completing |
| // the transaction below (which updates the resume marker), these steps |
| // are re-executed. |
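| //
| // As an illustration of echo suppression: log entries written by the
| // sync engine itself (e.g. while applying updates received from a peer)
| // carry the FromSync flag and are skipped below, so remote changes are
| // not re-imported into the sync metadata as if they were local updates.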
| totalCount := uint64(len(logs)) |
| batch := make([]*LocalLogRec, 0, totalCount) |
| |
| for _, e := range logs { |
| if !e.FromSync { |
| if rec, err := convertLogRecord(ctx, s.id, e); err != nil { |
| vlog.Errorf("sync: processWatchLogBatch: %v: bad entry: %v: %v", dbId, *e, err) |
| return err |
| } else if rec != nil { |
| if syncable(dbId, rec.Metadata.ObjId) { |
| batch = append(batch, rec) |
| } |
| } |
| } |
| } |
| |
| vlog.VI(3).Infof("sync: processWatchLogBatch: %v: sg snap %t, syncable %d, total %d", dbId, !appBatch, len(batch), totalCount) |
| |
| if err := s.processWatchBlobRefs(ctx, dbId, st, batch); err != nil { |
| // There may be an error here if the database was recently
| // destroyed. Ignore the error and continue to another database.
| vlog.Errorf("sync: processWatchLogBatch: %v: watcher cannot process blob refs: %v", dbId, err) |
| return nil |
| } |
| |
| // Transactional processing of the batch: Fixup syncable log records to |
| // augment them with DAG information. |
| err = store.RunInTransaction(st, func(tx store.Transaction) error { |
| txBatch, err := fixupLogRecordBatch(ctx, tx, batch, appBatch) |
| if err != nil { |
| return err |
| } |
| |
| if err := s.processBatch(ctx, dbId, txBatch, appBatch, totalCount, tx); err != nil { |
| return err |
| } |
| |
| if !appBatch { |
| if err := setSyncgroupWatchable(ctx, tx, sgop); err != nil { |
| return err |
| } |
| } |
| |
| return setResMark(ctx, tx, resMark) |
| }) |
| |
| if err != nil { |
| // There may be an error here if the database was recently
| // destroyed. Ignore the error and continue to another database.
| // TODO(rdaoud): quarantine this database for other errors. |
| vlog.Errorf("sync: processWatchLogBatch: %v: watcher cannot process batch: %v", dbId, err) |
| } |
| return nil |
| } |
| |
| // processBatch applies a single batch of changes (object mutations) received |
| // from watching a particular Database. |
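| //
| // For example (illustrative only), a syncable batch of three records reserves
| // three consecutive generation numbers and log positions, so the records are
| // assigned (gen, gen+1, gen+2) and (pos, pos+1, pos+2) in order, all sharing
| // the same batch ID when one is assigned.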
| func (s *syncService) processBatch(ctx *context.T, dbId wire.Id, batch []*LocalLogRec, appBatch bool, totalCount uint64, tx store.Transaction) error { |
| count := uint64(len(batch)) |
| if count == 0 { |
| return nil |
| } |
| |
| // If an application batch has more than one mutation, start a batch for it. |
| batchId := NoBatchId |
| if appBatch && totalCount > 1 { |
| batchId = s.startBatch(ctx, tx, batchId) |
| if batchId == NoBatchId { |
| return verror.New(verror.ErrInternal, ctx, "failed to generate batch ID") |
| } |
| } |
| |
| gen, pos := s.reserveGenAndPosInDbLog(ctx, dbId, "", count) |
| |
| vlog.VI(3).Infof("sync: processBatch: %v: len %d, total %d, btid %x, gen %d, pos %d", dbId, count, totalCount, batchId, gen, pos) |
| |
| for _, rec := range batch { |
| // Update the log record. Portions of the record Metadata must |
| // already be filled. |
| rec.Metadata.Gen = gen |
| |
| rec.Metadata.BatchId = batchId |
| rec.Metadata.BatchCount = totalCount |
| |
| rec.Pos = pos |
| |
| gen++ |
| pos++ |
| |
| if err := s.processLocalLogRec(ctx, tx, rec); err != nil { |
| return verror.New(verror.ErrInternal, ctx, err) |
| } |
| } |
| |
| // End the batch if any. |
| if batchId != NoBatchId { |
| if err := s.endBatch(ctx, tx, batchId, totalCount); err != nil { |
| return err |
| } |
| } |
| |
| return nil |
| } |
| |
| // processLocalLogRec processes a local log record by adding it to the Database
| // and suitably updating the DAG metadata.
| func (s *syncService) processLocalLogRec(ctx *context.T, tx store.Transaction, rec *LocalLogRec) error { |
| // Insert the new log record into the log. |
| if err := putLogRec(ctx, tx, logDataPrefix, rec); err != nil { |
| return err |
| } |
| |
| m := &rec.Metadata |
| logKey := logRecKey(logDataPrefix, m.Id, m.Gen) |
| |
| // Insert the new log record into the DAG.
| if err := s.addNode(ctx, tx, m.ObjId, m.CurVers, logKey, m.Delete, m.Parents, m.BatchId, nil); err != nil { |
| return err |
| } |
| |
| // Move the head. |
| return moveHead(ctx, tx, m.ObjId, m.CurVers) |
| } |
| |
| // processWatchBlobRefs extracts blob refs from the data values of the updates |
| // received in the watch batch and updates the blob-to-syncgroup metadata. |
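| //
| // Note that the value is fetched at the exact version recorded in the log
| // record (watchable.GetAtVersion), so blob refs are extracted from the value
| // that was written even if the row has since been overwritten.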
| func (s *syncService) processWatchBlobRefs(ctx *context.T, dbId wire.Id, st store.Store, batch []*LocalLogRec) error { |
| if len(batch) == 0 { |
| return nil |
| } |
| |
| sgPfxs := watchPrefixes[dbId] |
| if len(sgPfxs) == 0 { |
| return verror.New(verror.ErrInternal, ctx, "processWatchBlobRefs: no sg prefixes in db", dbId) |
| } |
| |
| for _, rec := range batch { |
| m := &rec.Metadata |
| if m.Delete { |
| continue |
| } |
| |
| buf, err := watchable.GetAtVersion(ctx, st, []byte(m.ObjId), nil, []byte(m.CurVers)) |
| if err != nil { |
| return err |
| } |
| |
| if err = s.processBlobRefs(ctx, dbId, st, s.name, true, sgPfxs, nil, m, buf); err != nil { |
| return err |
| } |
| } |
| return nil |
| } |
| |
| // addWatchPrefixSyncgroup adds a syncgroup prefix-to-ID mapping for an app |
| // database in the watch prefix cache. |
| func addWatchPrefixSyncgroup(dbId wire.Id, prefix string, gid interfaces.GroupId) { |
| if pfxs := watchPrefixes[dbId]; pfxs != nil { |
| if sgs := pfxs[prefix]; sgs != nil { |
| sgs[gid] = struct{}{} |
| } else { |
| pfxs[prefix] = sgSet{gid: struct{}{}} |
| } |
| } else { |
| watchPrefixes[dbId] = sgPrefixes{prefix: sgSet{gid: struct{}{}}} |
| } |
| } |
| |
| // rmWatchPrefixSyncgroup removes a syncgroup prefix-to-ID mapping for an app |
| // database in the watch prefix cache. |
| func rmWatchPrefixSyncgroup(dbId wire.Id, prefix string, gid interfaces.GroupId) { |
| if pfxs := watchPrefixes[dbId]; pfxs != nil { |
| if sgs := pfxs[prefix]; sgs != nil { |
| delete(sgs, gid) |
| if len(sgs) == 0 { |
| delete(pfxs, prefix) |
| if len(pfxs) == 0 { |
| delete(watchPrefixes, dbId) |
| } |
| } |
| } |
| } |
| } |
| |
| // setSyncgroupWatchable sets the local watchable state of the syncgroup. |
| func setSyncgroupWatchable(ctx *context.T, tx store.Transaction, sgop *sbwatchable.SyncgroupOp) error { |
| state, err := getSGIdEntry(ctx, tx, sgop.SgId) |
| if err != nil { |
| return err |
| } |
| state.Watched = !sgop.Remove |
| |
| return setSGIdEntry(ctx, tx, sgop.SgId, state) |
| } |
| |
| // convertLogRecord converts a store log entry to an initial sync log record |
| // that contains metadata from the store log entry and the Syncbase ID, but no |
| // information from the DAG which is filled later within the scope of a store |
| // transaction. For a delete, it generates a new object version because the |
| // store does not version a deletion. |
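| // For example (illustrative only), a DeleteOp for key "foo" yields a log
| // record with Delete set to true and a freshly generated version from
| // watchable.NewVersion(), whereas a PutOp reuses the version recorded by the
| // store.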
| // TODO(rdaoud): change Syncbase to store and version a deleted object to |
| // simplify the store-to-sync interaction. A deleted key would still have a |
| // version and its value entry would encode the "deleted" flag, either in the |
| // key or probably in a value wrapper that would contain other metadata. |
| func convertLogRecord(ctx *context.T, syncId uint64, logEnt *watchable.LogEntry) (*LocalLogRec, error) { |
| var op interface{} |
| if err := logEnt.Op.ToValue(&op); err != nil { |
| return nil, err |
| } |
| |
| var key, version string |
| deleted := false |
| timestamp := logEnt.CommitTimestamp |
| |
| switch op := op.(type) { |
| case *watchable.PutOp: |
| key, version = string(op.Key), string(op.Version) |
| |
| case *sbwatchable.SyncSnapshotOp: |
| key, version = string(op.Key), string(op.Version) |
| |
| case *watchable.DeleteOp: |
| key, version, deleted = string(op.Key), string(watchable.NewVersion()), true |
| |
| case *watchable.GetOp: |
| // TODO(rdaoud): save read-set in sync. |
| return nil, nil |
| |
| case *watchable.ScanOp: |
| // TODO(rdaoud): save scan-set in sync. |
| return nil, nil |
| |
| case *sbwatchable.SyncgroupOp: |
| // We can ignore the syncgroup op. |
| return nil, nil |
| |
| default: |
| return nil, verror.New(verror.ErrInternal, ctx, "cannot convert unknown watch log entry") |
| } |
| |
| rec := &LocalLogRec{ |
| Metadata: interfaces.LogRecMetadata{ |
| ObjId: key, |
| CurVers: version, |
| Delete: deleted, |
| UpdTime: unixNanoToTime(timestamp), |
| Id: syncId, |
| RecType: interfaces.NodeRec, |
| }, |
| } |
| return rec, nil |
| } |
| |
| // fixupLogRecordBatch updates the sync log records in a batch with information |
| // retrieved from the DAG within the scope of a store transaction and returns |
| // an updated batch. This allows the transaction to track these data fetches in |
| // its read-set, which is required for the proper handling of optimistic locking |
| // between store transactions. If the input batch is not an application batch |
| // (e.g. one generated by a syncgroup snapshot), the returned batch may include |
| // fewer log records because duplicates were filtered out. |
| func fixupLogRecordBatch(ctx *context.T, tx store.Transaction, batch []*LocalLogRec, appBatch bool) ([]*LocalLogRec, error) { |
| newBatch := make([]*LocalLogRec, 0, len(batch)) |
| |
| for _, rec := range batch { |
| m := &rec.Metadata |
| |
| // Check if this object version already exists in the DAG. This
| // is only allowed for non-application batches: syncgroup
| // initialization snapshots for nested or peer syncgroups can
| // create duplicate log records, which are skipped here. For an
| // application batch, a duplicate is an error.
| if ok, err := hasNode(ctx, tx, m.ObjId, m.CurVers); err != nil { |
| return nil, err |
| } else if ok { |
| if appBatch { |
| return nil, verror.New(verror.ErrInternal, ctx, "duplicate log record", m.ObjId, m.CurVers) |
| } |
| } else { |
| // Set the current DAG head as the parent of this update. |
| m.Parents = nil |
| if head, err := getHead(ctx, tx, m.ObjId); err == nil { |
| m.Parents = []string{head} |
| } else if m.Delete || (verror.ErrorID(err) != verror.ErrNoExist.ID) { |
| return nil, verror.New(verror.ErrInternal, ctx, "cannot getHead to fixup log record", m.ObjId) |
| } |
| |
| newBatch = append(newBatch, rec) |
| } |
| } |
| |
| return newBatch, nil |
| } |
| |
| // processDbStateChangeLogRecord checks whether the log entry is a
| // DbStateChangeRequest and, if so, executes the state change request and
| // returns true. Otherwise it returns false.
| // TODO(razvanm): change the return type to error.
| func processDbStateChangeLogRecord(ctx *context.T, s *syncService, st store.Store, dbId wire.Id, logEnt *watchable.LogEntry, resMark watch.ResumeMarker) bool { |
| var op interface{} |
| if err := logEnt.Op.ToValue(&op); err != nil { |
| vlog.Fatalf("sync: processDbStateChangeLogRecord: %v: bad VOM: %v", dbId, err) |
| } |
| switch op := op.(type) { |
| case *sbwatchable.DbStateChangeRequestOp: |
| dbStateChangeRequest := op |
| vlog.VI(1).Infof("sync: processDbStateChangeLogRecord: found a dbState change log record with state %#v", dbStateChangeRequest) |
| isPaused := false |
| if err := store.RunInTransaction(st, func(tx store.Transaction) error { |
| switch dbStateChangeRequest.RequestType { |
| case sbwatchable.StateChangePauseSync: |
| vlog.VI(1).Infof("sync: processDbStateChangeLogRecord: PauseSync request found. Pausing sync.") |
| isPaused = true |
| case sbwatchable.StateChangeResumeSync: |
| vlog.VI(1).Infof("sync: processDbStateChangeLogRecord: ResumeSync request found. Resuming sync.") |
| isPaused = false |
| default: |
| return fmt.Errorf("Unexpected DbStateChangeRequest found: %#v", dbStateChangeRequest) |
| } |
| // Update isPaused state in db. |
| if err := s.updateDbPauseSt(ctx, tx, dbId, isPaused); err != nil { |
| return err |
| } |
| return setResMark(ctx, tx, resMark) |
| }); err != nil { |
| // TODO(rdaoud): don't crash, quarantine this database. |
| vlog.Fatalf("sync: processDbStateChangeLogRecord: %v: watcher failed to reset dbState bits: %v", dbId, err) |
| } |
| // Update isPaused state in cache. |
| s.updateInMemoryPauseSt(ctx, dbId, isPaused) |
| return true |
| |
| default: |
| return false |
| } |
| } |
| |
| // processSyncgroupLogRecord checks if the log entries contain a syncgroup |
| // update and, if they do, updates the watch prefixes for the database and |
| // returns a syncgroup operation. Otherwise it returns a nil operation with no |
| // other changes. |
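| //
| // For example (hypothetical values), a SyncgroupOp with Prefixes {"foo",
| // "bar"} and Remove false results in addWatchPrefixSyncgroup() being called
| // for both prefixes, mapping each of them to the op's group id in the watch
| // prefix cache.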
| func processSyncgroupLogRecord(dbId wire.Id, logs []*watchable.LogEntry) (*sbwatchable.SyncgroupOp, error) { |
| for _, logEnt := range logs { |
| var op interface{} |
| |
| if err := logEnt.Op.ToValue(&op); err != nil { |
| return nil, err |
| } |
| |
| switch op := op.(type) { |
| case *sbwatchable.SyncgroupOp: |
| gid, remove := op.SgId, op.Remove |
| for _, prefix := range op.Prefixes { |
| if remove { |
| rmWatchPrefixSyncgroup(dbId, prefix, gid) |
| } else { |
| addWatchPrefixSyncgroup(dbId, prefix, gid) |
| } |
| } |
| vlog.VI(3).Infof("sync: processSyncgroupLogRecord: %v: gid %s, remove %t, prefixes: %q", dbId, gid, remove, op.Prefixes) |
| return op, nil |
| } |
| } |
| return nil, nil |
| } |
| |
| // syncable returns true if the given key falls within the scope of a syncgroup |
| // prefix for the given database, and thus should be synced. |
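| //
| // For example (illustrative only), if the database has a syncgroup whose
| // prefix is "foo", then rows "foo" and "foobar" are syncable while "fo" and
| // "bar" are not, since the comparison is a plain strings.HasPrefix() on the
| // key with its reserved first part stripped.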
| func syncable(dbId wire.Id, key string) bool { |
| // The key starts with one of the store's reserved prefixes for managed |
| // namespaces (e.g. "r" for row, "c" for collection perms). Remove that |
| // prefix before comparing it with the syncgroup prefixes. |
| key = common.StripFirstKeyPartOrDie(key) |
| |
| for prefix := range watchPrefixes[dbId] { |
| if strings.HasPrefix(key, prefix) { |
| return true |
| } |
| } |
| return false |
| } |
| |
| // resMarkKey returns the key used to access the watcher resume marker. |
| func resMarkKey() string { |
| return common.JoinKeyParts(common.SyncPrefix, "w", "rm") |
| } |
| |
| // setResMark stores the watcher resume marker for a database. |
| func setResMark(ctx *context.T, tx store.Transaction, resMark watch.ResumeMarker) error { |
| return store.Put(ctx, tx, resMarkKey(), resMark) |
| } |
| |
| // getResMark retrieves the watcher resume marker for a database. |
| func getResMark(ctx *context.T, st store.StoreReader) (watch.ResumeMarker, error) { |
| var resMark watch.ResumeMarker |
| key := resMarkKey() |
| if err := store.Get(ctx, st, key, &resMark); err != nil { |
| return nil, err |
| } |
| return resMark, nil |
| } |