| // Copyright 2015 The Vanadium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package vsync |
| |
| // Syncbase Watcher is a goroutine that listens to local Database updates from |
| // applications and modifies sync metadata (e.g. DAG and local log records). |
| // The coupling between Syncbase storage and sync is loose: the Watcher listens |
| // asynchronously so that application operations are unblocked as soon as |
| // possible and the sync metadata updates are offloaded to the Watcher. When an |
| // application mutates objects in a Database, additional entries are written to |
| // a log queue persisted in the same Database. The sync Watcher reads this |
| // queue to learn of the changes. |
| |
| import ( |
| "fmt" |
| "strings" |
| "time" |
| |
| "v.io/v23/context" |
| wire "v.io/v23/services/syncbase" |
| "v.io/v23/services/watch" |
| "v.io/v23/verror" |
| "v.io/x/lib/vlog" |
| "v.io/x/ref/services/syncbase/common" |
| "v.io/x/ref/services/syncbase/server/interfaces" |
| "v.io/x/ref/services/syncbase/store" |
| "v.io/x/ref/services/syncbase/store/watchable" |
| sbwatchable "v.io/x/ref/services/syncbase/watchable" |
| ) |
| |
| var ( |
| // watchPrefixes is an in-memory cache of syncgroup prefixes for each |
| // database. It is filled at startup from persisted syncgroup data |
| // and updated at runtime when syncgroups are joined or left. It is not |
| // guarded by a mutex because it is filled at startup, before any sync |
| // goroutines are started, and only the watcher goroutine uses it afterwards. |
| // The map keys are the database ids (globally unique). |
| watchPrefixes = make(map[wire.Id]sgPrefixes) |
| ) |
| |
| // sgPrefixes maps each syncgroup prefix being synced in a database to the |
| // set of syncgroups that cover it. |
| type sgPrefixes map[string]sgSet |
| |
| // watchStore processes updates obtained by watching the store. This is the |
| // sync watcher goroutine that learns about store updates asynchronously by |
| // reading log records that track object mutation histories in each database. |
| // For each batch mutation, the watcher updates the sync DAG and log records. |
| // When an application makes a single non-transactional put, it is represented |
| // as a batch of one log record. Thus the watcher only deals with batches. |
| func (s *syncService) watchStore(ctx *context.T) { |
| defer s.pending.Done() |
| |
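| // The watcher polls the stores at a regular interval rather than |
| // blocking on update notifications. |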
| ticker := time.NewTicker(watchPollInterval) |
| defer ticker.Stop() |
| |
| ctx, cancel := context.WithCancel(ctx) |
| defer cancel() |
| |
| for !s.isClosed() { |
| select { |
| case <-ticker.C: |
| if s.isClosed() { |
| break |
| } |
| s.processStoreUpdates(ctx) |
| |
| case <-s.closed: |
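| // This break only exits the select; the enclosing for loop terminates |
| // because s.isClosed() now returns true. |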
| break |
| } |
| } |
| |
| vlog.VI(1).Info("sync: watchStore: channel closed, stop watching and exit") |
| } |
| |
| // processStoreUpdates fetches updates from all databases and processes them. |
| // To maintain fairness among databases, it processes one batch update from |
| // each database, in a round-robin manner, until there are no further updates |
| // from any database. |
| func (s *syncService) processStoreUpdates(ctx *context.T) { |
| for { |
| total, active := 0, 0 |
| s.forEachDatabaseStore(ctx, func(dbId wire.Id, st *watchable.Store) bool { |
| if s.processDatabase(ctx, dbId, st) { |
| active++ |
| } |
| total++ |
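| // Return false to keep iterating over all databases. |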
| return false |
| }) |
| |
| vlog.VI(2).Infof("sync: processStoreUpdates: %d/%d databases had updates", active, total) |
| if active == 0 { |
| break |
| } |
| } |
| } |
| |
| // processDatabase fetches from the given database at most one new batch update |
| // (transaction) and processes it. The one-batch limit prevents one database |
| // from starving others. A batch is stored as a contiguous set of log records |
| // ending with one record having the "continued" flag set to false. The call |
| // returns true if a new batch update was processed. |
| func (s *syncService) processDatabase(ctx *context.T, dbId wire.Id, st store.Store) bool { |
| s.thLock.Lock() |
| defer s.thLock.Unlock() |
| |
| vlog.VI(2).Infof("sync: processDatabase: begin: %v", dbId) |
| defer vlog.VI(2).Infof("sync: processDatabase: end: %v", dbId) |
| |
| resMark, err := getResMark(ctx, st) |
| if err != nil { |
| if verror.ErrorID(err) != verror.ErrNoExist.ID { |
| vlog.Errorf("sync: processDatabase: %v: cannot get resMark: %v", dbId, err) |
| return false |
| } |
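| // No resume marker yet: start watching from the beginning of the log. |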
| resMark = watchable.MakeResumeMarker(0) |
| } |
| |
| // Initialize Database sync state if needed. |
| s.initSyncStateInMem(ctx, dbId, "") |
| |
| // Get a batch of watch log entries, if any, after this resume marker. |
| logs, nextResmark, err := watchable.ReadBatchFromLog(st, resMark) |
| if err != nil { |
| // An error here (e.g. a cancelled scan stream) is possible when the |
| // database is destroyed while the watcher is in the middle of |
| // processing it. Hence, we just skip this database and proceed. |
| vlog.Errorf("sync: processDatabase: %v: cannot get watch log batch: %v", dbId, verror.DebugString(err)) |
| return false |
| } |
| if logs != nil { |
| s.processWatchLogBatch(ctx, dbId, st, logs, nextResmark) |
| return true |
| } |
| return false |
| } |
| |
| // processWatchLogBatch parses the given batch of watch log records, updates the |
| // watchable syncgroup prefixes, uses the prefixes to filter the batch to the |
| // subset of syncable records, and transactionally applies these updates to the |
| // sync metadata (DAG & log records) and updates the watch resume marker. |
| func (s *syncService) processWatchLogBatch(ctx *context.T, dbId wire.Id, st store.Store, logs []*watchable.LogEntry, resMark watch.ResumeMarker) { |
| if len(logs) == 0 { |
| return |
| } |
| |
| if processDbStateChangeLogRecord(ctx, s, st, dbId, logs[0], resMark) { |
| // A batch containing a DbStateChange request has no other records, |
| // so there is nothing more to process in this batch. |
| return |
| } |
| |
| // If the first log entry is a syncgroup prefix operation, then this is |
| // a syncgroup snapshot and not an application batch. In this case, |
| // handle the syncgroup prefix changes by updating the watch prefixes, |
| // exclude the first entry from the batch, and inform the batch |
| // processing below not to assign it a batch ID in the DAG. |
| appBatch := true |
| sgop := processSyncgroupLogRecord(dbId, logs[0]) |
| if sgop != nil { |
| appBatch = false |
| logs = logs[1:] |
| } |
| |
| // Filter out the log entries for keys not part of any syncgroup. Also |
| // ignore log entries made by sync itself (echo suppression). |
| totalCount := uint64(len(logs)) |
| |
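| // Compact the slice in place, keeping only the syncable entries. Note |
| // that totalCount reflects the batch size before this filtering and is |
| // later recorded as the BatchCount of the log records. |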
| i := 0 |
| for _, entry := range logs { |
| if !entry.FromSync && syncable(dbId, entry) { |
| logs[i] = entry |
| i++ |
| } |
| } |
| logs = logs[:i] |
| vlog.VI(3).Infof("sync: processWatchLogBatch: %v: sg snap %t, syncable %d, total %d", dbId, !appBatch, len(logs), totalCount) |
| |
| // Transactional processing of the batch: convert these syncable log |
| // records to a batch of sync log records, filling their parent versions |
| // from the DAG head nodes. |
| batch := make([]*LocalLogRec, len(logs)) |
| err := store.RunInTransaction(st, func(tx store.Transaction) error { |
| i := 0 |
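| // convertLogRecord returns a nil record for entries that produce no |
| // sync mutation (e.g. read-only ops or snapshot entries already in the |
| // DAG); compact the batch in place to skip them. |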
| for _, entry := range logs { |
| if rec, err := convertLogRecord(ctx, tx, entry); err != nil { |
| return err |
| } else if rec != nil { |
| batch[i] = rec |
| i++ |
| } |
| } |
| batch = batch[:i] |
| |
| if err := s.processBatch(ctx, dbId, batch, appBatch, totalCount, tx); err != nil { |
| return err |
| } |
| |
| if !appBatch { |
| if err := setSyncgroupWatchable(ctx, tx, sgop); err != nil { |
| return err |
| } |
| } |
| |
| return setResMark(ctx, tx, resMark) |
| }) |
| |
| if err != nil { |
| // TODO(rdaoud): quarantine this database for other errors. |
| // An error can happen here if the database was recently |
| // destroyed. Ignore the error and continue to another database. |
| vlog.Errorf("sync: processWatchLogBatch: %v: watcher cannot process batch: %v", dbId, err) |
| return |
| } |
| |
| // Extract blob refs from batch values and update blob metadata. |
| // TODO(rdaoud): the core of this step, extracting blob refs from the |
| // app data, is idempotent and should be done before the transaction |
| // above, which updates the resume marker. If Syncbase crashes here it |
| // does not re-do this blob ref processing at restart, and the metadata |
| // becomes lower quality. Unfortunately, the log conversion must happen |
| // inside the transaction because it accesses DAG information that must |
| // be in the Tx read-set for optimistic locking. The TODO is to split |
| // the conversion into two phases: a non-transactional first phase that |
| // runs outside the transaction, followed by the (idempotent) blob ref |
| // processing also outside the transaction, and then a second phase |
| // inside the transaction that patches up the log records. |
| if err = s.processWatchBlobRefs(ctx, dbId, st, batch); err != nil { |
| // An error can happen here if the database was recently |
| // destroyed. Ignore the error and continue to another database. |
| vlog.Errorf("sync: processWatchLogBatch: %v: watcher cannot process blob refs: %v", dbId, err) |
| return |
| } |
| } |
| |
| // processBatch applies a single batch of changes (object mutations) received |
| // from watching a particular Database. |
| func (s *syncService) processBatch(ctx *context.T, dbId wire.Id, batch []*LocalLogRec, appBatch bool, totalCount uint64, tx store.Transaction) error { |
| count := uint64(len(batch)) |
| if count == 0 { |
| return nil |
| } |
| |
| // If an application batch has more than one mutation, start a batch for it. |
| batchId := NoBatchId |
| if appBatch && totalCount > 1 { |
| batchId = s.startBatch(ctx, tx, batchId) |
| if batchId == NoBatchId { |
| return verror.New(verror.ErrInternal, ctx, "failed to generate batch ID") |
| } |
| } |
| |
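| // Reserve a contiguous range of generation numbers and log positions |
| // in the database log for the records of this batch. |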
| gen, pos := s.reserveGenAndPosInDbLog(ctx, dbId, "", count) |
| |
| vlog.VI(3).Infof("sync: processBatch: %v: len %d, total %d, btid %x, gen %d, pos %d", dbId, count, totalCount, batchId, gen, pos) |
| |
| for _, rec := range batch { |
| // Update the log record. Portions of the record Metadata must |
| // already be filled. |
| rec.Metadata.Id = s.id |
| rec.Metadata.Gen = gen |
| rec.Metadata.RecType = interfaces.NodeRec |
| |
| rec.Metadata.BatchId = batchId |
| rec.Metadata.BatchCount = totalCount |
| |
| rec.Pos = pos |
| |
| gen++ |
| pos++ |
| |
| if err := s.processLocalLogRec(ctx, tx, rec); err != nil { |
| return verror.New(verror.ErrInternal, ctx, err) |
| } |
| } |
| |
| // End the batch if any. |
| if batchId != NoBatchId { |
| if err := s.endBatch(ctx, tx, batchId, totalCount); err != nil { |
| return err |
| } |
| } |
| |
| return nil |
| } |
| |
| // processLocalLogRec processes a local log record by adding it to the |
| // Database and suitably updating the DAG metadata. |
| func (s *syncService) processLocalLogRec(ctx *context.T, tx store.Transaction, rec *LocalLogRec) error { |
| // Insert the new log record into the log. |
| if err := putLogRec(ctx, tx, logDataPrefix, rec); err != nil { |
| return err |
| } |
| |
| m := &rec.Metadata |
| logKey := logRecKey(logDataPrefix, m.Id, m.Gen) |
| |
| // Add a DAG node for the new log record. |
| if err := s.addNode(ctx, tx, m.ObjId, m.CurVers, logKey, m.Delete, m.Parents, m.BatchId, nil); err != nil { |
| return err |
| } |
| |
| // Move the head. |
| return moveHead(ctx, tx, m.ObjId, m.CurVers) |
| } |
| |
| // processWatchBlobRefs extracts blob refs from the data values of the updates |
| // received in the watch batch and updates the blob-to-syncgroup metadata. |
| func (s *syncService) processWatchBlobRefs(ctx *context.T, dbId wire.Id, st store.Store, batch []*LocalLogRec) error { |
| if len(batch) == 0 { |
| return nil |
| } |
| |
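| // Every record in the batch matched a syncgroup prefix (see the |
| // syncable() filter), so the prefix cache must have entries for this |
| // database. |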
| sgPfxs := watchPrefixes[dbId] |
| if len(sgPfxs) == 0 { |
| return verror.New(verror.ErrInternal, ctx, "processWatchBlobRefs: no sg prefixes in db", dbId) |
| } |
| |
| for _, rec := range batch { |
| m := &rec.Metadata |
| if m.Delete { |
| continue |
| } |
| |
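| // Fetch the value at this exact object version to scan it for blob refs. |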
| buf, err := watchable.GetAtVersion(ctx, st, []byte(m.ObjId), nil, []byte(m.CurVers)) |
| if err != nil { |
| return err |
| } |
| |
| if err = s.processBlobRefs(ctx, dbId, st, s.name, true, sgPfxs, nil, m, buf); err != nil { |
| return err |
| } |
| } |
| return nil |
| } |
| |
| // addWatchPrefixSyncgroup adds a syncgroup prefix-to-ID mapping for an app |
| // database in the watch prefix cache. |
| func addWatchPrefixSyncgroup(dbId wire.Id, prefix string, gid interfaces.GroupId) { |
| if pfxs := watchPrefixes[dbId]; pfxs != nil { |
| if sgs := pfxs[prefix]; sgs != nil { |
| sgs[gid] = struct{}{} |
| } else { |
| pfxs[prefix] = sgSet{gid: struct{}{}} |
| } |
| } else { |
| watchPrefixes[dbId] = sgPrefixes{prefix: sgSet{gid: struct{}{}}} |
| } |
| } |
| |
| // rmWatchPrefixSyncgroup removes a syncgroup prefix-to-ID mapping for an app |
| // database in the watch prefix cache. |
| func rmWatchPrefixSyncgroup(dbId wire.Id, prefix string, gid interfaces.GroupId) { |
| if pfxs := watchPrefixes[dbId]; pfxs != nil { |
| if sgs := pfxs[prefix]; sgs != nil { |
| delete(sgs, gid) |
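| // Garbage-collect empty prefix sets and database entries. |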
| if len(sgs) == 0 { |
| delete(pfxs, prefix) |
| if len(pfxs) == 0 { |
| delete(watchPrefixes, dbId) |
| } |
| } |
| } |
| } |
| } |
| |
| // setSyncgroupWatchable sets the local watchable state of the syncgroup. |
| func setSyncgroupWatchable(ctx *context.T, tx store.Transaction, sgop *sbwatchable.SyncgroupOp) error { |
| state, err := getSGIdEntry(ctx, tx, sgop.SgId) |
| if err != nil { |
| return err |
| } |
| state.Watched = !sgop.Remove |
| |
| return setSGIdEntry(ctx, tx, sgop.SgId, state) |
| } |
| |
| // convertLogRecord converts a store log entry to a sync log record. It fills |
| // the previous object version (parent) by fetching its current DAG head if it |
| // has one. For a delete, it generates a new object version because the store |
| // does not version a deletion. |
| // TODO(rdaoud): change Syncbase to store and version a deleted object to |
| // simplify the store-to-sync interaction. A deleted key would still have a |
| // version and its value entry would encode the "deleted" flag, either in the |
| // key or probably in a value wrapper that would contain other metadata. |
| func convertLogRecord(ctx *context.T, tx store.Transaction, logEnt *watchable.LogEntry) (*LocalLogRec, error) { |
| var rec *LocalLogRec |
| timestamp := logEnt.CommitTimestamp |
| |
| var op interface{} |
| if err := logEnt.Op.ToValue(&op); err != nil { |
| return nil, err |
| } |
| switch op := op.(type) { |
| case *watchable.GetOp: |
| // TODO(rdaoud): save read-set in sync. |
| |
| case *watchable.ScanOp: |
| // TODO(rdaoud): save scan-set in sync. |
| |
| case *watchable.PutOp: |
| rec = newLocalLogRec(ctx, tx, op.Key, op.Version, false, timestamp) |
| |
| case *sbwatchable.SyncSnapshotOp: |
| // Create records for object versions not already in the DAG. |
| // Duplicates can appear here in cases of nested syncgroups or |
| // peer syncgroups. |
| if ok, err := hasNode(ctx, tx, string(op.Key), string(op.Version)); err != nil { |
| return nil, err |
| } else if !ok { |
| rec = newLocalLogRec(ctx, tx, op.Key, op.Version, false, timestamp) |
| } |
| |
| case *watchable.DeleteOp: |
| rec = newLocalLogRec(ctx, tx, op.Key, watchable.NewVersion(), true, timestamp) |
| |
| case *sbwatchable.SyncgroupOp: |
| vlog.Errorf("sync: convertLogRecord: watch LogEntry for syncgroup should not be converted: %v", logEnt) |
| return nil, verror.New(verror.ErrInternal, ctx, "cannot convert a watch log OpSyncgroup entry") |
| |
| default: |
| vlog.Errorf("sync: convertLogRecord: invalid watch LogEntry: %v", logEnt) |
| return nil, verror.New(verror.ErrInternal, ctx, "cannot convert unknown watch log entry") |
| } |
| |
| return rec, nil |
| } |
| |
| // newLocalLogRec creates a local sync log record given its information: key, |
| // version, deletion flag, and timestamp. It retrieves the current DAG head |
| // for the key (if one exists) to use as its parent (previous) version. |
| func newLocalLogRec(ctx *context.T, tx store.Transaction, key, version []byte, deleted bool, timestamp int64) *LocalLogRec { |
| rec := LocalLogRec{} |
| oid := string(key) |
| |
| rec.Metadata.ObjId = oid |
| rec.Metadata.CurVers = string(version) |
| rec.Metadata.Delete = deleted |
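| // A put on a new object legitimately has no DAG head, but a deletion |
| // implies the object already exists, so a missing head is fatal. |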
| if head, err := getHead(ctx, tx, oid); err == nil { |
| rec.Metadata.Parents = []string{head} |
| } else if deleted || (verror.ErrorID(err) != verror.ErrNoExist.ID) { |
| vlog.Fatalf("sync: newLocalLogRec: cannot getHead to convert log record for %s: %v", oid, err) |
| } |
| rec.Metadata.UpdTime = unixNanoToTime(timestamp) |
| return &rec |
| } |
| |
| // processDbStateChangeLogRecord checks if the log entry is a |
| // DbStateChangeRequest and, if so, executes the state change request |
| // appropriately. |
| // TODO(razvanm): change the return type to error. |
| func processDbStateChangeLogRecord(ctx *context.T, s *syncService, st store.Store, dbId wire.Id, logEnt *watchable.LogEntry, resMark watch.ResumeMarker) bool { |
| var op interface{} |
| if err := logEnt.Op.ToValue(&op); err != nil { |
| vlog.Fatalf("sync: processDbStateChangeLogRecord: %v: bad VOM: %v", dbId, err) |
| } |
| switch op := op.(type) { |
| case *sbwatchable.DbStateChangeRequestOp: |
| dbStateChangeRequest := op |
| vlog.VI(1).Infof("sync: processDbStateChangeLogRecord: found a dbState change log record with state %#v", dbStateChangeRequest) |
| isPaused := false |
| if err := store.RunInTransaction(st, func(tx store.Transaction) error { |
| switch dbStateChangeRequest.RequestType { |
| case sbwatchable.StateChangePauseSync: |
| vlog.VI(1).Infof("sync: processDbStateChangeLogRecord: PauseSync request found. Pausing sync.") |
| isPaused = true |
| case sbwatchable.StateChangeResumeSync: |
| vlog.VI(1).Infof("sync: processDbStateChangeLogRecord: ResumeSync request found. Resuming sync.") |
| isPaused = false |
| default: |
| return fmt.Errorf("unexpected DbStateChangeRequest found: %#v", dbStateChangeRequest) |
| } |
| // Update isPaused state in db. |
| if err := s.updateDbPauseSt(ctx, tx, dbId, isPaused); err != nil { |
| return err |
| } |
| return setResMark(ctx, tx, resMark) |
| }); err != nil { |
| // TODO(rdaoud): don't crash, quarantine this database. |
| vlog.Fatalf("sync: processDbStateChangeLogRecord: %v: watcher failed to reset dbState bits: %v", dbId, err) |
| } |
| // Update isPaused state in cache. |
| s.updateInMemoryPauseSt(ctx, dbId, isPaused) |
| return true |
| |
| default: |
| return false |
| } |
| } |
| |
| // processSyncgroupLogRecord checks if the log entry is a syncgroup update and, |
| // if it is, updates the watch prefixes for the database and returns a syncgroup |
| // operation. Otherwise it returns nil with no other changes. |
| // TODO(razvanm): change the return to also include an error. |
| func processSyncgroupLogRecord(dbId wire.Id, logEnt *watchable.LogEntry) *sbwatchable.SyncgroupOp { |
| var op interface{} |
| if err := logEnt.Op.ToValue(&op); err != nil { |
| vlog.Fatalf("sync: processSyncgroupLogRecord: %v: bad VOM: %v", dbId, err) |
| } |
| switch op := op.(type) { |
| case *sbwatchable.SyncgroupOp: |
| gid, remove := op.SgId, op.Remove |
| for _, prefix := range op.Prefixes { |
| if remove { |
| rmWatchPrefixSyncgroup(dbId, prefix, gid) |
| } else { |
| addWatchPrefixSyncgroup(dbId, prefix, gid) |
| } |
| } |
| vlog.VI(3).Infof("sync: processSyncgroupLogRecord: %v: gid %d, remove %t, prefixes: %q", dbId, gid, remove, op.Prefixes) |
| return op |
| |
| default: |
| return nil |
| } |
| } |
| |
| // syncable returns true if the given log entry falls within the scope of a |
| // syncgroup prefix for the given database, and thus should be synced. It is |
| // used to pre-filter the batch of log entries before sync processing. |
| // TODO(razvanm): change the return type to error. |
| func syncable(dbId wire.Id, logEnt *watchable.LogEntry) bool { |
| var key string |
| var op interface{} |
| if err := logEnt.Op.ToValue(&op); err != nil { |
| vlog.Fatalf("sync: syncable: %v: bad VOM: %v", dbId, err) |
| } |
| switch op := op.(type) { |
| case *watchable.PutOp: |
| key = string(op.Key) |
| case *watchable.DeleteOp: |
| key = string(op.Key) |
| case *sbwatchable.SyncSnapshotOp: |
| key = string(op.Key) |
| default: |
| return false |
| } |
| |
| // The key starts with one of the store's reserved prefixes for managed |
| // namespaces (e.g. "r" for row, "c" for collection perms). Remove that |
| // prefix before comparing it with the syncgroup prefixes which are defined |
| // by the application. |
| key = common.StripFirstKeyPartOrDie(key) |
| |
| for prefix := range watchPrefixes[dbId] { |
| if strings.HasPrefix(key, prefix) { |
| return true |
| } |
| } |
| return false |
| } |
| |
| // resMarkKey returns the key used to access the watcher resume marker. |
| func resMarkKey() string { |
| return common.JoinKeyParts(common.SyncPrefix, "w", "rm") |
| } |
| |
| // setResMark stores the watcher resume marker for a database. |
| func setResMark(ctx *context.T, tx store.Transaction, resMark watch.ResumeMarker) error { |
| return store.Put(ctx, tx, resMarkKey(), resMark) |
| } |
| |
| // getResMark retrieves the watcher resume marker for a database. |
| func getResMark(ctx *context.T, st store.StoreReader) (watch.ResumeMarker, error) { |
| var resMark watch.ResumeMarker |
| key := resMarkKey() |
| if err := store.Get(ctx, st, key, &resMark); err != nil { |
| return nil, err |
| } |
| return resMark, nil |
| } |