services/mounttable/btmtd: Add row id

Before this change, we implicitly relied on createChild() never being
called concurrently within the same millisecond. In reality, this can
happen and does happen once in a while in the tests.

With this change, each row has its own 4-byte ID and the parent uses
both the child's ID and its name to reference it.

(This is not backward compatible)

Change-Id: Ib0334a25caa98307cf1eeed4932454c5721310cc
diff --git a/services/mounttable/btmtd/README.md b/services/mounttable/btmtd/README.md
index 8de7315..7801f37 100644
--- a/services/mounttable/btmtd/README.md
+++ b/services/mounttable/btmtd/README.md
@@ -19,6 +19,7 @@
 The table has three column families:
 
    * Metadata `m`: used to store information about the row:
+      * ID `i`: A 4-byte ID for the node. Doesn't have to be globally unique.
       * Version `v`: The version changes every time the row is mutated. It is
         used to detect conflicts related to concurrent access.
       * Creator `c`: The cell's value is the name of its creator and its
@@ -34,11 +35,13 @@
 
 Example:
 
-| Key              | Version | Creator    | Sticky | Permissions  | Mounted Server...           | Child...  |
-| ---              | ---     | ---        | ---    | ---          | ---                         | ---       |
-| 540f1a56/        | 54321   | user (ts)  | 1      | {"Admin":... |                             | foo (ts1) |
-| 1234abcd/foo     | 123     | user (ts1) |        | {"Admin":... |                             | bar (ts2) |
-| 46d523e3/foo/bar | 5436    | user (ts2) |        | {"Admin":... | /example.com:123 (deadline) |           |
+| Key              | ID  | Version | Creator | Sticky | Permissions  | Mounted Server...           | Child... |
+| ---              | --- | ---     | ---     | ---    | ---          | ---                         | ---      |
+| 540f1a56/        | id1 | 54321   | user    | 1      | {"Admin":... |                             | (id2)foo |
+| 1234abcd/foo     | id2 | 123     | user    |        | {"Admin":... |                             | (id3)bar |
+| 46d523e3/foo/bar | id3 | 5436    | user    |        | {"Admin":... | /example.com:123 (deadline) |          |
+
+Counters are stored in another table, one row with one column per counter.
 
 ## Mutations
 
diff --git a/services/mounttable/btmtd/internal/bt.go b/services/mounttable/btmtd/internal/bt.go
index 3769523..9396673 100644
--- a/services/mounttable/btmtd/internal/bt.go
+++ b/services/mounttable/btmtd/internal/bt.go
@@ -9,7 +9,6 @@
 	"fmt"
 	"io/ioutil"
 	"math/rand"
-	"strconv"
 	"strings"
 	"time"
 
@@ -37,6 +36,7 @@
 	serversFamily  = "s"
 	childrenFamily = "c"
 
+	idColumn          = "i"
 	creatorColumn     = "c"
 	permissionsColumn = "p"
 	stickyColumn      = "s"
@@ -162,7 +162,11 @@
 	}
 	perms := make(access.Permissions)
 	perms.Add(security.AllPrincipals, string(v23mt.Admin))
-	return b.createRow(ctx, "", perms, "", b.now(), 0)
+	child, err := newChild("ROOTNODE", "")
+	if err != nil {
+		return err
+	}
+	return b.createRow(ctx, "", perms, "", child, 0)
 }
 
 func (b *BigTable) timeFloor(t bigtable.Timestamp) bigtable.Timestamp {
@@ -174,14 +178,6 @@
 	return (t / 1000) * 1000
 }
 
-func (b *BigTable) timeNext(t bigtable.Timestamp) bigtable.Timestamp {
-	return (t/1000 + 1) * 1000
-}
-
-func (b *BigTable) now() bigtable.Timestamp {
-	return b.timeFloor(bigtable.Now())
-}
-
 func (b *BigTable) time(t time.Time) bigtable.Timestamp {
 	return b.timeFloor(bigtable.Time(t))
 }
@@ -227,8 +223,12 @@
 				}
 				fmt.Printf(" flags: %+v", n.mountFlags)
 			}
-			if len(n.children) > 0 {
-				fmt.Printf(" children: [%s]", strings.Join(n.children, " "))
+			children := make([]string, len(n.children))
+			for i, c := range n.children {
+				children[i] = c.name()
+			}
+			if len(children) > 0 {
+				fmt.Printf(" children: [%s]", strings.Join(children, " "))
 			}
 			fmt.Println()
 			return true
@@ -342,7 +342,7 @@
 	return row, err
 }
 
-func (b *BigTable) createRow(ctx *context.T, name string, perms access.Permissions, creator string, ts bigtable.Timestamp, limit int64) error {
+func (b *BigTable) createRow(ctx *context.T, name string, perms access.Permissions, creator string, ch child, limit int64) error {
 	jsonPerms, err := json.Marshal(perms)
 	if err != nil {
 		return err
@@ -360,9 +360,10 @@
 		}
 	}()
 	mut := bigtable.NewMutation()
-	mut.Set(metadataFamily, creatorColumn, ts, []byte(creator))
+	mut.Set(metadataFamily, idColumn, bigtable.ServerTime, []byte(ch.id()))
+	mut.Set(metadataFamily, creatorColumn, bigtable.ServerTime, []byte(creator))
 	mut.Set(metadataFamily, permissionsColumn, bigtable.ServerTime, jsonPerms)
-	mut.Set(metadataFamily, versionColumn, bigtable.ServerTime, []byte(strconv.FormatUint(uint64(rand.Uint32()), 10)))
+	mut.Set(metadataFamily, versionColumn, bigtable.ServerTime, []byte(fmt.Sprintf("%08x", rand.Uint32())))
 
 	filter := bigtable.ChainFilters(bigtable.FamilyFilter(metadataFamily), bigtable.ColumnFilter(creatorColumn))
 	condMut := bigtable.NewCondMutation(filter, nil, mut)
diff --git a/services/mounttable/btmtd/internal/mounttable.go b/services/mounttable/btmtd/internal/mounttable.go
index e7c1406..e3f1388 100644
--- a/services/mounttable/btmtd/internal/mounttable.go
+++ b/services/mounttable/btmtd/internal/mounttable.go
@@ -237,10 +237,10 @@
 		}
 		matcher, left := state.g.Head(), state.g.Tail()
 		for _, child := range n.children {
-			if matcher.Match(child) {
+			if matcher.Match(child.name()) {
 				// TODO(rthellend): We should protect against large query results
 				// that could blow up memory usage.
-				queue = append(queue, gState{naming.Join(state.name, child), left})
+				queue = append(queue, gState{naming.Join(state.name, child.name()), left})
 			}
 		}
 	}
diff --git a/services/mounttable/btmtd/internal/node.go b/services/mounttable/btmtd/internal/node.go
index b83da7b..c999369 100644
--- a/services/mounttable/btmtd/internal/node.go
+++ b/services/mounttable/btmtd/internal/node.go
@@ -7,6 +7,7 @@
 import (
 	"encoding/hex"
 	"encoding/json"
+	"fmt"
 	"hash/fnv"
 	"io/ioutil"
 	"path"
@@ -38,6 +39,7 @@
 
 type mtNode struct {
 	bt             *BigTable
+	id             string
 	name           string
 	sticky         bool
 	creationTime   bigtable.Timestamp
@@ -47,7 +49,7 @@
 	mountFlags     mtFlags
 	servers        []naming.MountedServer
 	expiredServers []string
-	children       []string
+	children       []child
 }
 
 type mtFlags struct {
@@ -55,6 +57,30 @@
 	Leaf bool `json:"leaf,omitempty"`
 }
 
+type child string
+
+func newChild(id, name string) (child, error) {
+	if len(id) != 8 {
+		return child(""), fmt.Errorf("expected id to be 8 characters: %q", id)
+	}
+	return child(id + name), nil
+}
+
+func childFromCol(col string) (child, error) {
+	if len(col) <= 8 {
+		return child(""), fmt.Errorf("expected col to be more than 8 characters: %q", col)
+	}
+	return child(col), nil
+}
+
+func (c child) id() string {
+	return string(c)[:8]
+}
+
+func (c child) name() string {
+	return string(c)[8:]
+}
+
 func longTimeout(ctx *context.T) (*context.T, func()) {
 	return context.WithTimeout(v23.GetBackgroundContext(ctx), time.Hour)
 }
@@ -91,6 +117,8 @@
 	for _, i := range row[metadataFamily] {
 		col := strings.TrimPrefix(i.Column, metadataFamily+":")
 		switch col {
+		case idColumn:
+			n.id = string(i.Value)
 		case stickyColumn:
 			n.sticky = true
 		case versionColumn:
@@ -122,18 +150,25 @@
 			Deadline: vdltime.Deadline{deadline},
 		})
 	}
-	n.children = make([]string, 0, len(row[childrenFamily]))
+	n.children = make([]child, 0, len(row[childrenFamily]))
 	for _, i := range row[childrenFamily] {
-		child := strings.TrimPrefix(i.Column, childrenFamily+":")
-		n.children = append(n.children, child)
+		childCol := strings.TrimPrefix(i.Column, childrenFamily+":")
+		if child, err := childFromCol(childCol); err != nil {
+			ctx.Errorf("childFromCol(%q) failed: %v", childCol, err)
+		} else {
+			n.children = append(n.children, child)
+		}
 	}
 	return n
 }
 
-func (n *mtNode) createChild(ctx *context.T, child string, perms access.Permissions, creator string, limit int64) (*mtNode, error) {
-	ts := n.bt.now()
+func (n *mtNode) createChild(ctx *context.T, childName string, perms access.Permissions, creator string, limit int64) (*mtNode, error) {
+	child, err := newChild(n.version, childName)
+	if err != nil {
+		return nil, err
+	}
 	mut := bigtable.NewMutation()
-	mut.Set(childrenFamily, child, ts, []byte{1})
+	mut.Set(childrenFamily, string(child), bigtable.ServerTime, []byte{1})
 	if err := n.mutate(ctx, mut, false); err != nil {
 		return nil, err
 	}
@@ -145,25 +180,44 @@
 	//  - the child is created again, or
 	//  - the parent is forcibly deleted with Delete().
 
-	childName := naming.Join(n.name, child)
+	childFullName := naming.Join(n.name, childName)
 	longCtx, cancel := longTimeout(ctx)
 	defer cancel()
-	if err := n.bt.createRow(longCtx, childName, perms, creator, ts, limit); err != nil {
+	if err := n.bt.createRow(longCtx, childFullName, perms, creator, child, limit); err != nil {
 		mut = bigtable.NewMutation()
-		mut.DeleteTimestampRange(childrenFamily, child, ts, n.bt.timeNext(ts))
+		mut.DeleteCellsInColumn(childrenFamily, string(child))
 		if err := n.bt.apply(ctx, rowKey(n.name), mut); err != nil {
-			ctx.Errorf("Failed to delete child reference. Parent=%q Child=%q Err=%v", n.name, child, err)
+			ctx.Errorf("Failed to delete child reference. Parent=%q Col=%q Err=%v", n.name, string(child), err)
 		}
 		return nil, err
 	}
-	n, err := getNode(ctx, n.bt, childName)
+	// Delete any stale references to the child that we just successfully
+	// created.
+	mut = nil
+	for _, c := range n.children {
+		if c.name() != childName {
+			continue
+		}
+		if mut == nil {
+			mut = bigtable.NewMutation()
+		}
+		mut.DeleteCellsInColumn(childrenFamily, string(c))
+	}
+	if mut != nil {
+		if err := n.bt.apply(ctx, rowKey(n.name), mut); err != nil {
+			ctx.Errorf("Failed to delete child reference. Parent=%q Err=%v", n.name, err)
+		}
+	}
+
+	// Return the new child node.
+	cn, err := getNode(ctx, n.bt, childFullName)
 	if err != nil {
 		return nil, err
 	}
-	if n == nil {
-		return nil, verror.New(errConcurrentAccess, ctx, childName)
+	if cn == nil {
+		return nil, verror.New(errConcurrentAccess, ctx, childFullName)
 	}
-	return n, nil
+	return cn, nil
 }
 
 func (n *mtNode) mount(ctx *context.T, server string, deadline time.Time, flags naming.MountFlag, limit int64) error {
@@ -293,7 +347,7 @@
 	// terms of memory. A smarter, but slower, approach would be to walk
 	// the tree without holding on to all the node data.
 	for _, c := range n.children {
-		cn, err := getNode(ctx, n.bt, naming.Join(n.name, c))
+		cn, err := getNode(ctx, n.bt, naming.Join(n.name, c.name()))
 		if err != nil {
 			return err
 		}
@@ -302,7 +356,7 @@
 			// exist. It could be that it is being created or
 			// deleted concurrently. To be sure, we have to create
 			// it before deleting it.
-			if cn, err = n.createChild(ctx, c, n.permissions, "", 0); err != nil {
+			if cn, err = n.createChild(ctx, c.name(), n.permissions, "", 0); err != nil {
 				return err
 			}
 		}
@@ -327,10 +381,7 @@
 	// Delete from parent node.
 	parent, child := path.Split(n.name)
 	mut = bigtable.NewMutation()
-	// DeleteTimestampRange deletes the cells whose timestamp is in the
-	// half open range [start,end). We need to delete the cell with
-	// timestamp n.creationTime (and any older ones).
-	mut.DeleteTimestampRange(childrenFamily, child, 0, n.bt.timeNext(n.creationTime))
+	mut.DeleteCellsInColumn(childrenFamily, n.id+child)
 
 	longCtx, cancel := longTimeout(ctx)
 	defer cancel()
@@ -363,11 +414,11 @@
 
 func (n *mtNode) mutate(ctx *context.T, mut *bigtable.Mutation, delete bool) error {
 	if !delete {
-		v, err := strconv.ParseUint(n.version, 10, 64)
+		v, err := strconv.ParseUint(n.version, 16, 32)
 		if err != nil {
 			return err
 		}
-		newVersion := strconv.FormatUint(v+1, 10)
+		newVersion := fmt.Sprintf("%08x", uint32(v)+1)
 		mut.Set(metadataFamily, versionColumn, bigtable.ServerTime, []byte(newVersion))
 	}
 
@@ -426,11 +477,14 @@
 	}
 	sort.Strings(sortedNodes)
 
-	ts := bt.now()
 	for _, node := range sortedNodes {
 		perms := nodes[node]
 		if node == "" {
-			if err := bt.createRow(ctx, "", perms, "", ts, 0); err != nil {
+			child, err := newChild("ROOTNODE", "")
+			if err != nil {
+				return err
+			}
+			if err := bt.createRow(ctx, "", perms, "", child, 0); err != nil {
 				return err
 			}
 			continue
diff --git a/services/mounttable/btmtd/internal/node_test.go b/services/mounttable/btmtd/internal/node_test.go
index 711d45d..aafbc1a 100644
--- a/services/mounttable/btmtd/internal/node_test.go
+++ b/services/mounttable/btmtd/internal/node_test.go
@@ -65,7 +65,7 @@
 				t.Errorf("[%d] node doesn't exist after createChild", i)
 				return false
 			}
-			if len(m.children) != 1 || m.children[0] != "Y" {
+			if len(m.children) != 1 || m.children[0].name() != "Y" {
 				t.Errorf("[%d] unexpected children after createChild: %v", i, m.children)
 				return false
 			}