services/mounttable/btmtd: Add row id
Before this change, we implicitly relied on createChild() never being
called concurrently within the same millisecond. In reality, this can
happen and does happen once in a while in the tests.
With this change, each row has its own 4-byte ID and the parent uses
both the child's ID and its name to reference it.
(This is not backward compatible)
Change-Id: Ib0334a25caa98307cf1eeed4932454c5721310cc
diff --git a/services/mounttable/btmtd/README.md b/services/mounttable/btmtd/README.md
index 8de7315..7801f37 100644
--- a/services/mounttable/btmtd/README.md
+++ b/services/mounttable/btmtd/README.md
@@ -19,6 +19,7 @@
The table has three column families:
* Metadata `m`: used to store information about the row:
+ * ID `i`: A 4-byte ID for the node. Doesn't have to be globally unique.
* Version `v`: The version changes every time the row is mutated. It is
used to detect conflicts related to concurrent access.
* Creator `c`: The cell's value is the name of its creator and its
@@ -34,11 +35,13 @@
Example:
-| Key | Version | Creator | Sticky | Permissions | Mounted Server... | Child... |
-| --- | --- | --- | --- | --- | --- | --- |
-| 540f1a56/ | 54321 | user (ts) | 1 | {"Admin":... | | foo (ts1) |
-| 1234abcd/foo | 123 | user (ts1) | | {"Admin":... | | bar (ts2) |
-| 46d523e3/foo/bar | 5436 | user (ts2) | | {"Admin":... | /example.com:123 (deadline) | |
+| Key | ID | Version | Creator | Sticky | Permissions | Mounted Server... | Child... |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| 540f1a56/ | id1 | 54321 | user | 1 | {"Admin":... | | (id2)foo |
+| 1234abcd/foo | id2 | 123 | user | | {"Admin":... | | (id3)bar |
+| 46d523e3/foo/bar | id3 | 5436 | user | | {"Admin":... | /example.com:123 (deadline) | |
+
+Counters are stored in another table, one row with one column per counter.
## Mutations
diff --git a/services/mounttable/btmtd/internal/bt.go b/services/mounttable/btmtd/internal/bt.go
index 3769523..9396673 100644
--- a/services/mounttable/btmtd/internal/bt.go
+++ b/services/mounttable/btmtd/internal/bt.go
@@ -9,7 +9,6 @@
"fmt"
"io/ioutil"
"math/rand"
- "strconv"
"strings"
"time"
@@ -37,6 +36,7 @@
serversFamily = "s"
childrenFamily = "c"
+ idColumn = "i"
creatorColumn = "c"
permissionsColumn = "p"
stickyColumn = "s"
@@ -162,7 +162,11 @@
}
perms := make(access.Permissions)
perms.Add(security.AllPrincipals, string(v23mt.Admin))
- return b.createRow(ctx, "", perms, "", b.now(), 0)
+ child, err := newChild("ROOTNODE", "")
+ if err != nil {
+ return err
+ }
+ return b.createRow(ctx, "", perms, "", child, 0)
}
func (b *BigTable) timeFloor(t bigtable.Timestamp) bigtable.Timestamp {
@@ -174,14 +178,6 @@
return (t / 1000) * 1000
}
-func (b *BigTable) timeNext(t bigtable.Timestamp) bigtable.Timestamp {
- return (t/1000 + 1) * 1000
-}
-
-func (b *BigTable) now() bigtable.Timestamp {
- return b.timeFloor(bigtable.Now())
-}
-
func (b *BigTable) time(t time.Time) bigtable.Timestamp {
return b.timeFloor(bigtable.Time(t))
}
@@ -227,8 +223,12 @@
}
fmt.Printf(" flags: %+v", n.mountFlags)
}
- if len(n.children) > 0 {
- fmt.Printf(" children: [%s]", strings.Join(n.children, " "))
+ children := make([]string, len(n.children))
+ for i, c := range n.children {
+ children[i] = c.name()
+ }
+ if len(children) > 0 {
+ fmt.Printf(" children: [%s]", strings.Join(children, " "))
}
fmt.Println()
return true
@@ -342,7 +342,7 @@
return row, err
}
-func (b *BigTable) createRow(ctx *context.T, name string, perms access.Permissions, creator string, ts bigtable.Timestamp, limit int64) error {
+func (b *BigTable) createRow(ctx *context.T, name string, perms access.Permissions, creator string, ch child, limit int64) error {
jsonPerms, err := json.Marshal(perms)
if err != nil {
return err
@@ -360,9 +360,10 @@
}
}()
mut := bigtable.NewMutation()
- mut.Set(metadataFamily, creatorColumn, ts, []byte(creator))
+ mut.Set(metadataFamily, idColumn, bigtable.ServerTime, []byte(ch.id()))
+ mut.Set(metadataFamily, creatorColumn, bigtable.ServerTime, []byte(creator))
mut.Set(metadataFamily, permissionsColumn, bigtable.ServerTime, jsonPerms)
- mut.Set(metadataFamily, versionColumn, bigtable.ServerTime, []byte(strconv.FormatUint(uint64(rand.Uint32()), 10)))
+ mut.Set(metadataFamily, versionColumn, bigtable.ServerTime, []byte(fmt.Sprintf("%08x", rand.Uint32())))
filter := bigtable.ChainFilters(bigtable.FamilyFilter(metadataFamily), bigtable.ColumnFilter(creatorColumn))
condMut := bigtable.NewCondMutation(filter, nil, mut)
diff --git a/services/mounttable/btmtd/internal/mounttable.go b/services/mounttable/btmtd/internal/mounttable.go
index e7c1406..e3f1388 100644
--- a/services/mounttable/btmtd/internal/mounttable.go
+++ b/services/mounttable/btmtd/internal/mounttable.go
@@ -237,10 +237,10 @@
}
matcher, left := state.g.Head(), state.g.Tail()
for _, child := range n.children {
- if matcher.Match(child) {
+ if matcher.Match(child.name()) {
// TODO(rthellend): We should protect against large query results
// that could blow up memory usage.
- queue = append(queue, gState{naming.Join(state.name, child), left})
+ queue = append(queue, gState{naming.Join(state.name, child.name()), left})
}
}
}
diff --git a/services/mounttable/btmtd/internal/node.go b/services/mounttable/btmtd/internal/node.go
index b83da7b..c999369 100644
--- a/services/mounttable/btmtd/internal/node.go
+++ b/services/mounttable/btmtd/internal/node.go
@@ -7,6 +7,7 @@
import (
"encoding/hex"
"encoding/json"
+ "fmt"
"hash/fnv"
"io/ioutil"
"path"
@@ -38,6 +39,7 @@
type mtNode struct {
bt *BigTable
+ id string
name string
sticky bool
creationTime bigtable.Timestamp
@@ -47,7 +49,7 @@
mountFlags mtFlags
servers []naming.MountedServer
expiredServers []string
- children []string
+ children []child
}
type mtFlags struct {
@@ -55,6 +57,30 @@
Leaf bool `json:"leaf,omitempty"`
}
+type child string
+
+func newChild(id, name string) (child, error) {
+ if len(id) != 8 {
+ return child(""), fmt.Errorf("expected id to be 8 characters: %q", id)
+ }
+ return child(id + name), nil
+}
+
+func childFromCol(col string) (child, error) {
+ if len(col) <= 8 {
+ return child(""), fmt.Errorf("expected col to be more than 8 characters: %q", col)
+ }
+ return child(col), nil
+}
+
+func (c child) id() string {
+ return string(c)[:8]
+}
+
+func (c child) name() string {
+ return string(c)[8:]
+}
+
func longTimeout(ctx *context.T) (*context.T, func()) {
return context.WithTimeout(v23.GetBackgroundContext(ctx), time.Hour)
}
@@ -91,6 +117,8 @@
for _, i := range row[metadataFamily] {
col := strings.TrimPrefix(i.Column, metadataFamily+":")
switch col {
+ case idColumn:
+ n.id = string(i.Value)
case stickyColumn:
n.sticky = true
case versionColumn:
@@ -122,18 +150,25 @@
Deadline: vdltime.Deadline{deadline},
})
}
- n.children = make([]string, 0, len(row[childrenFamily]))
+ n.children = make([]child, 0, len(row[childrenFamily]))
for _, i := range row[childrenFamily] {
- child := strings.TrimPrefix(i.Column, childrenFamily+":")
- n.children = append(n.children, child)
+ childCol := strings.TrimPrefix(i.Column, childrenFamily+":")
+ if child, err := childFromCol(childCol); err != nil {
+ ctx.Errorf("childFromCol(%q) failed: %v", childCol, err)
+ } else {
+ n.children = append(n.children, child)
+ }
}
return n
}
-func (n *mtNode) createChild(ctx *context.T, child string, perms access.Permissions, creator string, limit int64) (*mtNode, error) {
- ts := n.bt.now()
+func (n *mtNode) createChild(ctx *context.T, childName string, perms access.Permissions, creator string, limit int64) (*mtNode, error) {
+ child, err := newChild(n.version, childName)
+ if err != nil {
+ return nil, err
+ }
mut := bigtable.NewMutation()
- mut.Set(childrenFamily, child, ts, []byte{1})
+ mut.Set(childrenFamily, string(child), bigtable.ServerTime, []byte{1})
if err := n.mutate(ctx, mut, false); err != nil {
return nil, err
}
@@ -145,25 +180,44 @@
// - the child is created again, or
// - the parent is forcibly deleted with Delete().
- childName := naming.Join(n.name, child)
+ childFullName := naming.Join(n.name, childName)
longCtx, cancel := longTimeout(ctx)
defer cancel()
- if err := n.bt.createRow(longCtx, childName, perms, creator, ts, limit); err != nil {
+ if err := n.bt.createRow(longCtx, childFullName, perms, creator, child, limit); err != nil {
mut = bigtable.NewMutation()
- mut.DeleteTimestampRange(childrenFamily, child, ts, n.bt.timeNext(ts))
+ mut.DeleteCellsInColumn(childrenFamily, string(child))
if err := n.bt.apply(ctx, rowKey(n.name), mut); err != nil {
- ctx.Errorf("Failed to delete child reference. Parent=%q Child=%q Err=%v", n.name, child, err)
+ ctx.Errorf("Failed to delete child reference. Parent=%q Col=%q Err=%v", n.name, string(child), err)
}
return nil, err
}
- n, err := getNode(ctx, n.bt, childName)
+ // Delete any stale references to the child that we just successfully
+ // created.
+ mut = nil
+ for _, c := range n.children {
+ if c.name() != childName {
+ continue
+ }
+ if mut == nil {
+ mut = bigtable.NewMutation()
+ }
+ mut.DeleteCellsInColumn(childrenFamily, string(c))
+ }
+ if mut != nil {
+ if err := n.bt.apply(ctx, rowKey(n.name), mut); err != nil {
+ ctx.Errorf("Failed to delete child reference. Parent=%q Err=%v", n.name, err)
+ }
+ }
+
+ // Return the new child node.
+ cn, err := getNode(ctx, n.bt, childFullName)
if err != nil {
return nil, err
}
- if n == nil {
- return nil, verror.New(errConcurrentAccess, ctx, childName)
+ if cn == nil {
+ return nil, verror.New(errConcurrentAccess, ctx, childFullName)
}
- return n, nil
+ return cn, nil
}
func (n *mtNode) mount(ctx *context.T, server string, deadline time.Time, flags naming.MountFlag, limit int64) error {
@@ -293,7 +347,7 @@
// terms of memory. A smarter, but slower, approach would be to walk
// the tree without holding on to all the node data.
for _, c := range n.children {
- cn, err := getNode(ctx, n.bt, naming.Join(n.name, c))
+ cn, err := getNode(ctx, n.bt, naming.Join(n.name, c.name()))
if err != nil {
return err
}
@@ -302,7 +356,7 @@
// exist. It could be that it is being created or
// deleted concurrently. To be sure, we have to create
// it before deleting it.
- if cn, err = n.createChild(ctx, c, n.permissions, "", 0); err != nil {
+ if cn, err = n.createChild(ctx, c.name(), n.permissions, "", 0); err != nil {
return err
}
}
@@ -327,10 +381,7 @@
// Delete from parent node.
parent, child := path.Split(n.name)
mut = bigtable.NewMutation()
- // DeleteTimestampRange deletes the cells whose timestamp is in the
- // half open range [start,end). We need to delete the cell with
- // timestamp n.creationTime (and any older ones).
- mut.DeleteTimestampRange(childrenFamily, child, 0, n.bt.timeNext(n.creationTime))
+ mut.DeleteCellsInColumn(childrenFamily, n.id+child)
longCtx, cancel := longTimeout(ctx)
defer cancel()
@@ -363,11 +414,11 @@
func (n *mtNode) mutate(ctx *context.T, mut *bigtable.Mutation, delete bool) error {
if !delete {
- v, err := strconv.ParseUint(n.version, 10, 64)
+ v, err := strconv.ParseUint(n.version, 16, 32)
if err != nil {
return err
}
- newVersion := strconv.FormatUint(v+1, 10)
+ newVersion := fmt.Sprintf("%08x", uint32(v)+1)
mut.Set(metadataFamily, versionColumn, bigtable.ServerTime, []byte(newVersion))
}
@@ -426,11 +477,14 @@
}
sort.Strings(sortedNodes)
- ts := bt.now()
for _, node := range sortedNodes {
perms := nodes[node]
if node == "" {
- if err := bt.createRow(ctx, "", perms, "", ts, 0); err != nil {
+ child, err := newChild("ROOTNODE", "")
+ if err != nil {
+ return err
+ }
+ if err := bt.createRow(ctx, "", perms, "", child, 0); err != nil {
return err
}
continue
diff --git a/services/mounttable/btmtd/internal/node_test.go b/services/mounttable/btmtd/internal/node_test.go
index 711d45d..aafbc1a 100644
--- a/services/mounttable/btmtd/internal/node_test.go
+++ b/services/mounttable/btmtd/internal/node_test.go
@@ -65,7 +65,7 @@
t.Errorf("[%d] node doesn't exist after createChild", i)
return false
}
- if len(m.children) != 1 || m.children[0] != "Y" {
+ if len(m.children) != 1 || m.children[0].name() != "Y" {
t.Errorf("[%d] unexpected children after createChild: %v", i, m.children)
return false
}