store: implementing the trie and replacing the memstore impl.

Change-Id: Ib862b9ca09740290270ced4b50aad3b940158cfa
diff --git a/services/syncbase/store/leveldb/stream.go b/services/syncbase/store/leveldb/stream.go
index 98c3c2f..2d44f73 100644
--- a/services/syncbase/store/leveldb/stream.go
+++ b/services/syncbase/store/leveldb/stream.go
@@ -44,7 +44,7 @@
 	s := &stream{
 		node:  store.NewResourceNode(),
 		cIter: cIter,
-		limit: limit,
+		limit: store.CopyBytes(nil, limit),
 	}
 	parent.AddChild(s.node, func() {
 		s.Cancel()
diff --git a/services/syncbase/store/memstore/snapshot.go b/services/syncbase/store/memstore/snapshot.go
index 557b4dd..f6cc7d0 100644
--- a/services/syncbase/store/memstore/snapshot.go
+++ b/services/syncbase/store/memstore/snapshot.go
@@ -9,27 +9,24 @@
 
 	"v.io/v23/verror"
 	"v.io/x/ref/services/syncbase/store"
+	"v.io/x/ref/services/syncbase/store/ptrie"
 )
 
 type snapshot struct {
 	store.SnapshotSpecImpl
 	mu   sync.Mutex
 	node *store.ResourceNode
-	data map[string][]byte
+	data *ptrie.T
 	err  error
 }
 
 var _ store.Snapshot = (*snapshot)(nil)
 
 // Assumes st lock is held.
-func newSnapshot(st *memstore, parent *store.ResourceNode) *snapshot {
-	dataCopy := make(map[string][]byte, len(st.data))
-	for k, v := range st.data {
-		dataCopy[k] = v
-	}
+func newSnapshot(data *ptrie.T, parent *store.ResourceNode) *snapshot {
 	s := &snapshot{
 		node: store.NewResourceNode(),
-		data: dataCopy,
+		data: data,
 	}
 	parent.AddChild(s.node, func() {
 		s.Abort()
@@ -56,11 +53,11 @@
 	if s.err != nil {
 		return valbuf, store.ConvertError(s.err)
 	}
-	value, ok := s.data[string(key)]
-	if !ok {
+	value := s.data.Get(key)
+	if value == nil {
 		return valbuf, verror.New(store.ErrUnknownKey, nil, string(key))
 	}
-	return store.CopyBytes(valbuf, value), nil
+	return store.CopyBytes(valbuf, value.([]byte)), nil
 }
 
 // Scan implements the store.StoreReader interface.
@@ -70,5 +67,5 @@
 	if s.err != nil {
 		return &store.InvalidStream{Error: s.err}
 	}
-	return newStream(s, s.node, start, limit)
+	return newStream(s.data, s.node, start, limit)
 }
diff --git a/services/syncbase/store/memstore/store.go b/services/syncbase/store/memstore/store.go
index 699c2c5..6639e8d 100644
--- a/services/syncbase/store/memstore/store.go
+++ b/services/syncbase/store/memstore/store.go
@@ -12,21 +12,22 @@
 
 	"v.io/v23/verror"
 	"v.io/x/ref/services/syncbase/store"
+	"v.io/x/ref/services/syncbase/store/ptrie"
 	"v.io/x/ref/services/syncbase/store/transactions"
 )
 
 type memstore struct {
 	mu   sync.Mutex
 	node *store.ResourceNode
-	data map[string][]byte
+	data *ptrie.T
 	err  error
 }
 
 // New creates a new memstore.
 func New() store.Store {
 	return transactions.Wrap(&memstore{
-		data: map[string][]byte{},
 		node: store.NewResourceNode(),
+		data: ptrie.New(true),
 	})
 }
 
@@ -49,11 +50,11 @@
 	if st.err != nil {
 		return valbuf, store.ConvertError(st.err)
 	}
-	value, ok := st.data[string(key)]
-	if !ok {
+	value := st.data.Get(key)
+	if value == nil {
 		return valbuf, verror.New(store.ErrUnknownKey, nil, string(key))
 	}
-	return store.CopyBytes(valbuf, value), nil
+	return store.CopyBytes(valbuf, value.([]byte)), nil
 }
 
 // Scan implements the store.StoreReader interface.
@@ -63,8 +64,7 @@
 	if st.err != nil {
 		return &store.InvalidStream{Error: st.err}
 	}
-	// TODO(sadovsky): Close snapshot once stream is closed or canceled.
-	return newSnapshot(st, st.node).Scan(start, limit)
+	return newStream(st.data.Copy(), st.node, start, limit)
 }
 
 // NewSnapshot implements the store.Store interface.
@@ -74,7 +74,7 @@
 	if st.err != nil {
 		return &store.InvalidSnapshot{Error: st.err}
 	}
-	return newSnapshot(st, st.node)
+	return newSnapshot(st.data.Copy(), st.node)
 }
 
 // WriteBatch implements the transactions.BatchStore interface.
@@ -87,9 +87,9 @@
 	for _, write := range batch {
 		switch write.T {
 		case transactions.PutOp:
-			st.data[string(write.Key)] = write.Value
+			st.data.Put(write.Key, store.CopyBytes(nil, write.Value))
 		case transactions.DeleteOp:
-			delete(st.data, string(write.Key))
+			st.data.Delete(write.Key)
 		default:
 			panic(fmt.Sprintf("unknown write operation type: %v", write.T))
 		}
diff --git a/services/syncbase/store/memstore/stream.go b/services/syncbase/store/memstore/stream.go
index 5f776d9..d7d8ba6 100644
--- a/services/syncbase/store/memstore/stream.go
+++ b/services/syncbase/store/memstore/stream.go
@@ -5,39 +5,27 @@
 package memstore
 
 import (
-	"sort"
 	"sync"
 
 	"v.io/v23/verror"
 	"v.io/x/ref/services/syncbase/store"
+	"v.io/x/ref/services/syncbase/store/ptrie"
 )
 
 type stream struct {
-	mu        sync.Mutex
-	node      *store.ResourceNode
-	sn        *snapshot
-	keys      []string
-	currIndex int
-	currKey   *string
-	err       error
-	done      bool
+	mu      sync.Mutex
+	node    *store.ResourceNode
+	pstream *ptrie.Stream
+	err     error
+	done    bool
 }
 
 var _ store.Stream = (*stream)(nil)
 
-func newStream(sn *snapshot, parent *store.ResourceNode, start, limit []byte) *stream {
-	keys := []string{}
-	for k := range sn.data {
-		if k >= string(start) && (len(limit) == 0 || k < string(limit)) {
-			keys = append(keys, k)
-		}
-	}
-	sort.Strings(keys)
+func newStream(data *ptrie.T, parent *store.ResourceNode, start, limit []byte) *stream {
 	s := &stream{
-		node:      store.NewResourceNode(),
-		sn:        sn,
-		keys:      keys,
-		currIndex: -1,
+		node:    store.NewResourceNode(),
+		pstream: data.Scan(start, limit),
 	}
 	parent.AddChild(s.node, func() {
 		s.Cancel()
@@ -49,16 +37,11 @@
 func (s *stream) Advance() bool {
 	s.mu.Lock()
 	defer s.mu.Unlock()
-	s.currKey = nil
 	if s.done {
 		return false
 	}
-	s.currIndex++
-	if s.currIndex < len(s.keys) {
-		s.currKey = &s.keys[s.currIndex]
-	} else {
-		s.done = true
-		s.currKey = nil
+	if s.done = !s.pstream.Advance(); s.done {
+		s.node.Close()
 	}
 	return !s.done
 }
@@ -67,20 +50,14 @@
 func (s *stream) Key(keybuf []byte) []byte {
 	s.mu.Lock()
 	defer s.mu.Unlock()
-	if s.currKey == nil {
-		panic("nothing staged")
-	}
-	return store.CopyBytes(keybuf, []byte(*s.currKey))
+	return s.pstream.Key(keybuf)
 }
 
 // Value implements the store.Stream interface.
 func (s *stream) Value(valbuf []byte) []byte {
 	s.mu.Lock()
 	defer s.mu.Unlock()
-	if s.currKey == nil {
-		panic("nothing staged")
-	}
-	return store.CopyBytes(valbuf, s.sn.data[*s.currKey])
+	return store.CopyBytes(valbuf, s.pstream.Value().([]byte))
 }
 
 // Err implements the store.Stream interface.
@@ -93,11 +70,10 @@
 // Cancel implements the store.Stream interface.
 func (s *stream) Cancel() {
 	s.mu.Lock()
-	defer s.mu.Unlock()
-	if s.done {
-		return
+	if !s.done {
+		s.done = true
+		s.node.Close()
+		s.err = verror.New(verror.ErrCanceled, nil, store.ErrMsgCanceledStream)
 	}
-	s.done = true
-	s.node.Close()
-	s.err = verror.New(verror.ErrCanceled, nil, store.ErrMsgCanceledStream)
+	s.mu.Unlock()
 }
diff --git a/services/syncbase/store/ptrie/ptrie.go b/services/syncbase/store/ptrie/ptrie.go
new file mode 100644
index 0000000..a3e5be4
--- /dev/null
+++ b/services/syncbase/store/ptrie/ptrie.go
@@ -0,0 +1,324 @@
+// Copyright 2015 The Vanadium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package ptrie provides a ptrie to store a mapping from bit strings to
+// arbitrary values. Ptrie exposes a simple interface: Get(key),
+// Put(key, value), Delete(key), and Scan(start, limit). Conceptually a ptrie
+// can be thought of as a map[[]byte]interface{}, designed to support fast
+// range queries and immutable views.
+//
+// For performance reasons, bit strings are represented as byte slices.
+// The ptrie consists of three concepts: a binary trie, path contraction and
+// copy-on-write modifications.
+//
+// 1) A binary trie consists of nodes and arcs. Each node has a value and two
+// children: 0 and 1. The value and the children might be nil. An arc connects
+// a node with its child. A node N has a value V and S is the bit string of
+// the path from the root to N iff the trie maps S to V. Using this rule we can
+// build maps and sets. For example, a set of {'b', 'c'} can be represented as:
+// 'b': 0110 0010     o
+// 'c': 0110 0011   0/
+//                  1\
+//                   1\
+//                   0/
+//                  0/
+//                 0/
+//                 1\
+//                   o
+//                 0/1\
+//                 o   o
+// This trie has 10 nodes. This representation is not efficient.
+// To reduce the number of nodes, we use the path contraction technique.
+//
+// 2) Path contraction. If a path consists only of nodes that have one child
+// and don't have a value, then the path can be replaced with one arc.
+// The new arc has the whole bit string written on the path. The example above
+// becomes smaller:            o
+//                     0110001/
+//                           o
+//                         0/1\
+//                         o   o
+// This structure is stored in memory in a slightly different way. The trie consists
+// of nodes, each node has a value and two children. Each non-nil child is
+// a triple: the child node, the bit string written on the arc and the length of
+// the bit string. For convenience, bits in the bit string are aligned so that
+// the bit string might be a sub slice of a bit string representing a path from
+// a root of the trie to the child.
+//
+// 3) Copy-on-write modifications. In order to support immutable views of
+// the data in the ptrie, the Put() and the Delete() functions have the makeCopy
+// flag. If the makeCopy flag is true, then the algorithm doesn't modify the
+// current ptrie, but it returns a new ptrie with some nodes reused from
+// the current one.
+package ptrie
+
+import (
+	"v.io/x/ref/services/syncbase/store"
+)
+
+// T represents a ptrie.
+type T struct {
+	root        *pnode
+	copyOnWrite bool
+}
+
+// New returns a new empty ptrie. If copyOnWrite is true, then the new ptrie
+// performs copy-on-write modifications for Put/Delete operations and it is
+// allowed to make a copy of the ptrie by calling Copy().
+func New(copyOnWrite bool) *T {
+	return &T{copyOnWrite: copyOnWrite}
+}
+
+// Put maps the given value to the given key. The value is not copied, so
+// the client must keep the value unchanged.
+func (t *T) Put(key []byte, value interface{}) {
+	t.root = t.root.put(key, value, t.copyOnWrite)
+}
+
+// Get returns a value mapped to the given key. Get returns nil if the given
+// key has no mapped value. The client must not modify the returned value as
+// the returned value points directly to the value stored in the ptrie.
+func (t *T) Get(key []byte) interface{} {
+	return t.root.get(key)
+}
+
+// Delete removes mapping to the given key.
+func (t *T) Delete(key []byte) {
+	t.root = t.root.delete(key, t.copyOnWrite)
+}
+
+// Scan returns all key-value pairs with keys in range [start, limit).
+// If limit is "", all key-value pairs with keys >= start are included.
+func (t *T) Scan(start, limit []byte) *Stream {
+	return t.root.Scan(start, limit)
+}
+
+// Copy returns a copy of the ptrie. This operation is only allowed if the ptrie
+// was created with the copyOnWrite flag. Copy is a very fast operation since
+// it just copies the pointer to the root of the ptrie.
+// Panics if the ptrie was created with copyOnWrite=false.
+func (t *T) Copy() *T {
+	if !t.copyOnWrite {
+		panic("the ptrie was not created in persistent mode")
+	}
+	return &T{
+		root:        t.root,
+		copyOnWrite: true,
+	}
+}
+
+// pnode represents a node in the ptrie.
+type pnode struct {
+	value interface{}
+	child [2]*pchild
+}
+
+// pchild represents a child of a node in a ptrie.
+type pchild struct {
+	node   *pnode
+	bitstr []byte
+	bitlen uint32
+}
+
+// put maps the given value to the given key, assuming the given node is
+// the root of the ptrie. The value is not copied, so the client must keep
+// the value unchanged. Put returns the new root of the ptrie.
+// The client must not modify the returned value as the returned value points
+// directly to the value stored in the ptrie.
+//
+// If the makeCopy flag is true, then the Put performs a copy-on-write
+// modification.
+//
+// A nil node is treated as an empty ptrie.
+func (node *pnode) put(key []byte, value interface{}, makeCopy bool) *pnode {
+	if value == nil {
+		return node.delete(key, makeCopy)
+	}
+	if node == nil {
+		node = &pnode{}
+	}
+	return putInternal(node, 0, key, value, makeCopy)
+}
+
+// get returns a value mapped to the given key, assuming the given node is
+// the root of the ptrie. Get returns nil if the given key has no mapped value.
+//
+// A nil node is treated as an empty ptrie.
+func (node *pnode) get(key []byte) interface{} {
+	if node == nil {
+		return nil
+	}
+	return getInternal(node, 0, key)
+}
+
+// delete removes mapping to the given key, assuming the given node is
+// the root of the ptrie. Delete returns the new root of the ptrie.
+//
+// If the makeCopy flag is true, then the Delete performs a copy-on-write
+// modification.
+//
+// A nil node is treated as an empty ptrie.
+func (node *pnode) delete(key []byte, makeCopy bool) *pnode {
+	if node == nil {
+		return nil
+	}
+	newNode, _ := deleteInternal(node, 0, key, makeCopy)
+	return newNode
+}
+
+// putInternal does a DFS through the ptrie to find a node corresponding to
+// the key and updates the value.
+//
+// Invariant: the first bitIndex bits of the key represent the path from
+// the root to the current node.
+func putInternal(node *pnode, bitIndex uint32, key []byte, value interface{}, makeCopy bool) *pnode {
+	if makeCopy {
+		node = copyNode(node)
+	}
+	if bitlen(key) == bitIndex {
+		// The node corresponding to the key is found, update the value.
+		node.value = value
+		return node
+	}
+	// Pick the appropriate child and check that         o - node
+	// the bit string of the path to the child node       \
+	// matches the corresponding substring of the key.     ?
+	// If not, then we need to insert a node              / \
+	// in the middle of the path to the child.           ?   o - child.node
+	currBit := getBit(key, bitIndex)
+	if makeCopy {
+		node.child[currBit] = copyChild(node.child[currBit])
+	}
+	child := node.child[currBit]
+	lcp := bitLCP(child, key[bitIndex>>3:], bitIndex&7)
+	if child != nil && lcp == child.bitlen {
+		// child.bitstr matches the substring of the key.
+		// Continue the DFS.
+		child.node = putInternal(child.node, bitIndex+lcp, key, value, makeCopy)
+		return node
+	}
+	// child.bitstr doesn't match the substring of the key.
+	// We need to insert a node in the middle of the path to the child.
+	//       o - node
+	//        \A
+	//         o - middleNode
+	//        / \B
+	//      C/   o - child.node
+	//      o - newChild.node
+	newChild := &pchild{
+		node:   &pnode{value: value},
+		bitstr: store.CopyBytes(nil, key[(bitIndex+lcp)>>3:]),
+		bitlen: bitlen(key) - bitIndex - lcp,
+	}
+	if child == nil {
+		// This case means that paths A and B are empty. Just attach the
+		// new child to the node.
+		node.child[currBit] = newChild
+		return node
+	}
+	// Since the child.node exists and we picked the child based on the currBit
+	// (a bit from the key), the path A is not empty.
+	// The path B also can't be empty since lcp < child.bitlen.
+	middleNode := new(pnode)
+	// Update the child of the node, i.e. the A part.
+	node.child[currBit] = &pchild{
+		node:   middleNode,
+		bitstr: store.CopyBytes(nil, child.bitstr[:((bitIndex&7)+lcp+7)>>3]),
+		bitlen: lcp,
+	}
+	// Pick the first bit on path C. Since C can be empty, we pick the first
+	// bit on B and invert it.
+	nextBit := getBit(child.bitstr, (bitIndex&7)+lcp) ^ 1
+	// Set the C part only if C is not empty.
+	if bitIndex+lcp < bitlen(key) {
+		middleNode.child[nextBit] = newChild
+	}
+	// Set the B part.
+	middleNode.child[nextBit^1] = &pchild{
+		node:   child.node,
+		bitstr: store.CopyBytes(nil, child.bitstr[((bitIndex&7)+lcp)>>3:]),
+		bitlen: child.bitlen - lcp,
+	}
+	return node
+}
+
+// getInternal does a DFS through the ptrie to find a node corresponding to
+// the key and returns the value.
+//
+// Invariant: the first bitIndex bits of the key represent the path from
+// the root to the current node.
+func getInternal(node *pnode, bitIndex uint32, key []byte) interface{} {
+	if bitlen(key) == bitIndex {
+		return node.value
+	}
+	child := node.child[getBit(key, bitIndex)]
+	lcp := bitLCP(child, key[bitIndex>>3:], bitIndex&7)
+	if child == nil || lcp != child.bitlen {
+		return nil
+	}
+	return getInternal(child.node, bitIndex+lcp, key)
+}
+
+// deleteInternal does a DFS through the ptrie to find a node corresponding to
+// the key and deletes the value. deleteInternal removes the whole subtree if
+// no nodes in the subtree have values.
+//
+// Invariant: the first bitIndex bits of the key represent the path from
+// the root to the current node.
+func deleteInternal(node *pnode, bitIndex uint32, key []byte, makeCopy bool) (newNode *pnode, deleted bool) {
+	if bitlen(key) == bitIndex {
+		// The node corresponding to the key is found.
+		if node.child[0] == nil && node.child[1] == nil {
+			return nil, true
+		}
+		if makeCopy {
+			node = copyNode(node)
+		}
+		node.value = nil
+		return node, true
+	}
+	// Pick the appropriate child and check that
+	// the bit string of the path to the child node
+	// matches the corresponding substring of the key.
+	currBit := getBit(key, bitIndex)
+	child := node.child[currBit]
+	lcp := bitLCP(child, key[bitIndex>>3:], bitIndex&7)
+	if child == nil || lcp != child.bitlen {
+		// child.bitstr doesn't match the substring of the key, so the key
+		// was not found in the ptrie.
+		return node, false
+	}
+	// Delete the key in the subtree.
+	if newNode, deleted = deleteInternal(child.node, bitIndex+lcp, key, makeCopy); !deleted {
+		return node, false
+	}
+	if makeCopy {
+		node = copyNode(node)
+	}
+	if newNode == nil {
+		// If the whole subtree was removed, just remove the child.
+		// It is possible that the node has no value and only one child. In this
+		// case the node will be contracted one step out of the recursion.
+		node.child[currBit] = nil
+		return node, true
+	}
+	if makeCopy {
+		node.child[currBit] = copyChild(node.child[currBit])
+	}
+	if newNode.value == nil && (newNode.child[0] == nil || newNode.child[1] == nil) {
+		// Contract the new node if necessary.
+		// Note: both children of the new node can't be nil since deleteInternal
+		// automatically removes empty subtrees.
+		child := newNode.child[0]
+		if child == nil {
+			child = newNode.child[1]
+		}
+		node.child[currBit].node = child.node
+		node.child[currBit].bitstr = appendBits(node.child[currBit].bitstr, (bitIndex&7)+node.child[currBit].bitlen, child.bitstr)
+		node.child[currBit].bitlen += child.bitlen
+	} else {
+		node.child[currBit].node = newNode
+	}
+	return node, true
+}
diff --git a/services/syncbase/store/ptrie/ptrie_test.go b/services/syncbase/store/ptrie/ptrie_test.go
new file mode 100644
index 0000000..e0da1dc
--- /dev/null
+++ b/services/syncbase/store/ptrie/ptrie_test.go
@@ -0,0 +1,49 @@
+// Copyright 2015 The Vanadium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// TODO(rogulenko): Add a benchmark to compare a ptrie with
+// a map[[]byte]interface{}.
+package ptrie
+
+import (
+	"testing"
+)
+
+// TestPutGetDelete verifies basic functionality of Put/Get/Delete.
+// More Put/Get/Delete/Scan tests can be found in store/memstore.
+// TODO(rogulenko): Add more tests, don't rely on tests in store/memstore.
+func TestPutGetDelete(t *testing.T) {
+	data := New(true)
+	data.Put([]byte("a"), "a")
+	data.Put([]byte("ab"), "ab")
+	if got, want := data.Get([]byte("a")).(string), "a"; got != want {
+		t.Fatalf("unexpected Get result: got %q, want %q", got, want)
+	}
+	if got, want := data.Get([]byte("ab")).(string), "ab"; got != want {
+		t.Fatalf("unexpected Get result: got %q, want %q", got, want)
+	}
+	// Verify that copy-on-write works.
+	newData := data.Copy()
+	newData.Delete([]byte("a"))
+	if got, want := data.Get([]byte("a")).(string), "a"; got != want {
+		t.Fatalf("unexpected Get result: got %q, want %q", got, want)
+	}
+	if value := newData.Get([]byte("a")); value != nil {
+		t.Fatalf("Get returned a non-nil value %v", value)
+	}
+	// Verify path contraction after Delete().
+	if newData.root.child[0].bitlen != 16 {
+		t.Fatal("path was not contracted after Delete()")
+	}
+	// Verify path contraction after Put("ac") and Delete("ac").
+	data = newData.Copy()
+	data.Put([]byte("ac"), "ac")
+	if got, want := data.Get([]byte("ac")).(string), "ac"; got != want {
+		t.Fatalf("unexpected Get result: got %q, want %q", got, want)
+	}
+	data.Delete([]byte("ab"))
+	if data.root.child[0].bitlen != 16 {
+		t.Fatal("path was not contracted after Delete()")
+	}
+}
diff --git a/services/syncbase/store/ptrie/stream.go b/services/syncbase/store/ptrie/stream.go
new file mode 100644
index 0000000..a61a7ef
--- /dev/null
+++ b/services/syncbase/store/ptrie/stream.go
@@ -0,0 +1,224 @@
+// Copyright 2015 The Vanadium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ptrie
+
+import (
+	"bytes"
+
+	"v.io/x/ref/services/syncbase/store"
+)
+
+// Stream is a struct for iterating through a ptrie.
+// WARNING: The stream is not thread-safe.
+//
+// The best way to walk through nodes of a tree is to perform a DFS.
+// To support the Advance() method, we save the current state of the DFS
+// by capturing the DFS stack.
+type Stream struct {
+	dfsStack    []*dfsStackElement
+	limit       []byte
+	hasAdvanced bool // true iff the stream has been advanced at least once
+	hasValue    bool // true iff a value has been staged
+	key         []byte
+}
+
+type dfsStackElement struct {
+	node    *pnode
+	childID int8
+}
+
+// Scan returns all key-value pairs with keys in range [start, limit).
+// If limit is "", all key-value pairs with keys >= start are included.
+//
+// A nil node is treated as an empty ptrie.
+func (node *pnode) Scan(start, limit []byte) *Stream {
+	if node == nil {
+		node = &pnode{}
+	}
+	s := &Stream{
+		limit: store.CopyBytes(nil, limit),
+	}
+	// Locate the first key-value pair with key >= start and capture
+	// the DFS stack.
+	s.lowerBound(node, 0, start)
+	return s
+}
+
+// Advance stages an element so the client can retrieve it with Key or Value.
+// Advance returns true iff there is an element to retrieve. The client must
+// call Advance before calling Key or Value.
+func (s *Stream) Advance() bool {
+	s.hasValue = false
+	if len(s.dfsStack) == 0 {
+		return false
+	}
+	if !s.hasAdvanced {
+		s.hasAdvanced = true
+	} else {
+		s.advanceInternal()
+	}
+	s.key = s.keyFromDFSStack()
+	if len(s.dfsStack) == 0 || (len(s.limit) != 0 && bytes.Compare(s.key, s.limit) >= 0) {
+		s.dfsStack = nil
+		return false
+	}
+	s.hasValue = true
+	return true
+}
+
+// Key returns the key of the element that was staged by Advance. The returned
+// slice may be a sub-slice of keybuf if keybuf was large enough to hold the
+// entire key. Otherwise, a newly allocated slice will be returned. It is
+// valid to pass a nil keybuf.
+// Key may panic if Advance returned false or was not called at all.
+func (s *Stream) Key(keybuf []byte) []byte {
+	if !s.hasValue {
+		panic("nothing staged")
+	}
+	return store.CopyBytes(keybuf, s.key)
+}
+
+// Value returns the value of the element that was staged by Advance.
+// The client should not modify the returned value as the returned
+// value points directly to the value stored in the ptrie.
+// Value may panic if Advance returned false or was not called at all.
+func (s *Stream) Value() interface{} {
+	if !s.hasValue {
+		panic("nothing staged")
+	}
+	return s.dfsStack[len(s.dfsStack)-1].node.value
+}
+
+// advanceInternal simulates the DFS until the next node with a value is found
+// or the stream reaches the end of the ptrie.
+func (s *Stream) advanceInternal() {
+	// The code below simulates the following recursive function:
+	// func dfs(node *Node) {
+	// 	if node.Value != nil {
+	// 		// The next key-value pair is found.
+	// 	}
+	// 	for i := 0; i < 2; i++ {
+	// 		if node.child[i] != nil {
+	// 			dfs(node.child[i].node)
+	// 		}
+	// 	}
+	// }
+	for len(s.dfsStack) > 0 {
+		top := s.dfsStack[len(s.dfsStack)-1]
+		// childID can be -1, 0, or 1.
+		top.childID++
+		for top.childID <= 1 && top.node.child[top.childID] == nil {
+			top.childID++
+		}
+		if top.childID > 1 {
+			s.popFromDFSStack()
+			continue
+		}
+		child := top.node.child[top.childID].node
+		s.dfsStack = append(s.dfsStack, &dfsStackElement{
+			node:    child,
+			childID: -1,
+		})
+		if child.value != nil {
+			return
+		}
+	}
+}
+
+// keyFromDFSStack returns the bit string written on the path from the root
+// to the node on the top of the DFS stack.
+func (s *Stream) keyFromDFSStack() []byte {
+	// Calculate the buffer size.
+	var bitlen uint32 = 0
+	for i := 0; i < len(s.dfsStack)-1; i++ {
+		element := s.dfsStack[i]
+		bitlen += element.node.child[element.childID].bitlen
+	}
+	if bitlen%8 != 0 {
+		panic("the number of bytes in a key is not an integer")
+	}
+	// Allocate the buffer.
+	buf := make([]byte, bitlen>>3)
+	bitlen = 0
+	// Concatenate all key parts.
+	for i := 0; i < len(s.dfsStack)-1; i++ {
+		element := s.dfsStack[i]
+		buf = appendBits(buf, bitlen, element.node.child[element.childID].bitstr)
+		bitlen += element.node.child[element.childID].bitlen
+	}
+	return buf
+}
+
+// lowerBound initializes the DFS stack to store a path to the first node with
+// a key >= start and a non-nil value. lowerBound returns true iff that node
+// was found in the subtree.
+//
+// Invariant: the first bitIndex bits of start represent the path from
+// the root to the current node.
+func (s *Stream) lowerBound(node *pnode, bitIndex uint32, start []byte) bool {
+	if bitIndex >= bitlen(start) {
+		s.findFirst(node)
+		return true
+	}
+	top := s.pushToDFSStack(node)
+	// Pick the appropriate child and check that
+	// the bit string of the path to the child node
+	// matches the corresponding substring of start.
+	currBit := getBit(start, bitIndex)
+	top.childID = int8(currBit)
+	if child := node.child[currBit]; child != nil {
+		lcp := bitLCP(child, start[bitIndex>>3:], bitIndex&7)
+		if lcp < child.bitlen {
+			// child.bitstr doesn't match the substring of start.
+			// Find the first node with a value in a subtree of the child if
+			// child.bitstr is greater than the corresponding substring of
+			// start.
+			if bitIndex+lcp == bitlen(start) || (bitIndex+lcp < bitlen(start) && getBit(start, bitIndex+lcp) == 0) {
+				s.findFirst(child.node)
+				return true
+			}
+		} else if s.lowerBound(child.node, bitIndex+lcp, start) {
+			return true
+		}
+	}
+	if currBit == 0 && node.child[1] != nil {
+		top.childID = 1
+		s.findFirst(node.child[1].node)
+		return true
+	}
+	s.popFromDFSStack()
+	return false
+}
+
+// findFirst simulates the DFS to find the first node with a value in a subtree.
+// NOTE: since the Put/Delete implementation automatically removes
+// subtrees without values, each subtree of a ptrie has a value.
+func (s *Stream) findFirst(node *pnode) {
+	top := s.pushToDFSStack(node)
+	if node.value != nil {
+		return
+	}
+	for top.childID < 1 {
+		top.childID++
+		if child := node.child[top.childID]; child != nil {
+			s.findFirst(child.node)
+			return
+		}
+	}
+	panic("subtree has no nodes with values")
+}
+
+func (s *Stream) pushToDFSStack(node *pnode) *dfsStackElement {
+	top := &dfsStackElement{
+		node:    node,
+		childID: -1,
+	}
+	s.dfsStack = append(s.dfsStack, top)
+	return top
+}
+
+func (s *Stream) popFromDFSStack() {
+	s.dfsStack = s.dfsStack[:len(s.dfsStack)-1]
+}
diff --git a/services/syncbase/store/ptrie/util.go b/services/syncbase/store/ptrie/util.go
new file mode 100644
index 0000000..d1fa68d
--- /dev/null
+++ b/services/syncbase/store/ptrie/util.go
@@ -0,0 +1,109 @@
+// Copyright 2015 The Vanadium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ptrie
+
+// mostSignificantBit stores the mapping from a byte to the position of its most
+// significant bit set to 1.
+var mostSignificantBit [256]uint32
+
+func init() {
+	for i := 2; i < 256; i++ {
+		mostSignificantBit[i] = mostSignificantBit[i>>1] + 1
+	}
+}
+
+// bitlen returns the length of a slice in bits, i.e. 8 * len(slice).
+func bitlen(slice []byte) uint32 {
+	return uint32(len(slice)) << 3
+}
+
+// getBit returns i-th lexicographically significant bit of a slice.
+// In other words, if we think of a byte slice as a big integer, getBit returns
+// the i-th bit of that integer counting from the highest to the lowest.
+func getBit(slice []byte, i uint32) byte {
+	return (slice[i>>3] >> (7 - (i & 7))) & 1
+}
+
+// sliceBitLCP returns the length of the longest common prefix between two
+// slices. The slices are compared as two bit strings. The first skipbits of
+// both slices are virtually removed and ignored.
+// NOTE: sliceBitLCP assumes that 0 <= skipbits < 8.
+func sliceBitLCP(a, b []byte, skipbits uint32) uint32 {
+	// Check that the first byte (without the first skipbits) is the same in
+	// both slices.
+	if x := (a[0] ^ b[0]) << skipbits; x != 0 {
+		return 7 - mostSignificantBit[x]
+	}
+	minlen := len(a)
+	if len(b) < minlen {
+		minlen = len(b)
+	}
+	for i := 1; i < minlen; i++ {
+		// (uint32(i<<3) - skipbits) --- the number of already compared bits.
+		if x := a[i] ^ b[i]; x != 0 {
+			return (uint32(i<<3) - skipbits) + (7 - mostSignificantBit[x])
+		}
+	}
+	return uint32(minlen<<3) - skipbits
+}
+
+// copyNode returns a copy of the provided pnode struct.
+func copyNode(node *pnode) *pnode {
+	if node == nil {
+		return nil
+	}
+	return &pnode{
+		value: node.value,
+		child: node.child,
+	}
+}
+
+// copyNode returns a copy of the provided child struct.
+func copyChild(child *pchild) *pchild {
+	if child == nil {
+		return nil
+	}
+	return &pchild{
+		node:   child.node,
+		bitstr: child.bitstr,
+		bitlen: child.bitlen,
+	}
+}
+
+// bitLCP returns the length of the longest common bit-prefix between the Path
+// of a child and a slice. The first skipbits of both slices are virtually
+// removed and ignored. Returns 0 if the child is nil.
+func bitLCP(child *pchild, slice []byte, skipbits uint32) uint32 {
+	if child == nil {
+		return 0
+	}
+	lcp := sliceBitLCP(child.bitstr, slice, skipbits)
+	if lcp > child.bitlen {
+		lcp = child.bitlen
+	}
+	return lcp
+}
+
+// appendBits appends 'b' to 'a', assuming that 'a' has the provided bit length.
+// If the bit length is a multiple of 8, appendBits just appends 'b' to 'a'.
+// Otherwise appendBits appends 'b' to 'a' overlapping the first byte of 'b'
+// with the last byte of 'a' so that the first bitlen bits of 'a' are unchanged.
+//
+// If 'a' has not enough capacity to hold the result, appendBits creates a new
+// slice to hold the result. Otherwise the result is stored in 'a'.
+func appendBits(a []byte, bitlen uint32, b []byte) []byte {
+	newlen := int(bitlen>>3) + len(b)
+	if newlen > cap(a) {
+		tmp := make([]byte, newlen)
+		copy(tmp, a)
+		a = tmp
+	}
+	a = a[:newlen]
+	oldByte := a[bitlen>>3]
+	copy(a[bitlen>>3:], b)
+	var bitmask byte = (1 << (8 - (bitlen & 7))) - 1
+	a[bitlen>>3] = (bitmask & a[bitlen>>3]) | (^bitmask & oldByte)
+	return a
+}
diff --git a/services/syncbase/store/ptrie/util_test.go b/services/syncbase/store/ptrie/util_test.go
new file mode 100644
index 0000000..0ab8892
--- /dev/null
+++ b/services/syncbase/store/ptrie/util_test.go
@@ -0,0 +1,74 @@
+// Copyright 2015 The Vanadium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ptrie
+
+import (
+	"bytes"
+	"testing"
+)
+
+// TestGetBit verifies the getBit() function.
+func TestGetBit(t *testing.T) {
+	x := []byte{0x1b, 0x1b, 0x1b}
+	xbits := []byte{
+		0, 0, 0, 1, 1, 0, 1, 1,
+		0, 0, 0, 1, 1, 0, 1, 1,
+		0, 0, 0, 1, 1, 0, 1, 1,
+	}
+	for i := 0; i < 24; i++ {
+		if got, want := getBit(x, uint32(i)), xbits[i]; got != want {
+			t.Fatalf("unexpected bit %d: got %d, want %d", i, got, want)
+		}
+	}
+}
+
+// TestGetBit verifies the appendBits() function.
+func TestAppendBits(t *testing.T) {
+	a := []byte{0x99, 0x99}
+	b := []byte{0xff, 0xff}
+	if got, want := appendBits(a, 8, b), []byte{0x99, 0xff, 0xff}; !bytes.Equal(got, want) {
+		t.Fatalf("unexpected appendBits result: got %v, want %v", got, want)
+	}
+	if got, want := appendBits(a, 9, b), []byte{0x99, 0xff, 0xff}; !bytes.Equal(got, want) {
+		t.Fatalf("unexpected appendBits result: got %v, want %v", got, want)
+	}
+	if got, want := appendBits(a, 10, b), []byte{0x99, 0xbf, 0xff}; !bytes.Equal(got, want) {
+		t.Fatalf("unexpected appendBits result: got %v, want %v", got, want)
+	}
+	if got, want := appendBits(a, 11, b), []byte{0x99, 0x9f, 0xff}; !bytes.Equal(got, want) {
+		t.Fatalf("unexpected appendBits result: got %v, want %v", got, want)
+	}
+	if got, want := appendBits(a, 14, b), []byte{0x99, 0x9b, 0xff}; !bytes.Equal(got, want) {
+		t.Fatalf("unexpected appendBits result: got %v, want %v", got, want)
+	}
+	if got, want := appendBits(a, 16, b), []byte{0x99, 0x99, 0xff, 0xff}; !bytes.Equal(got, want) {
+		t.Fatalf("unexpected appendBits result: got %v, want %v", got, want)
+	}
+}
+
+// TestSliceBitLCP verifies the sliceBitLCP() function.
+func TestSliceBitLCP(t *testing.T) {
+	for i := 0; i < 8; i++ {
+		if got, want := sliceBitLCP([]byte{0x99, 0x99}, []byte{0x99, 0x99}, uint32(i)), uint32(16-i); got != want {
+			t.Fatalf("unexpected sliceBitLCP result: got %d, want %d", got, want)
+		}
+	}
+	for i := 0; i < 8; i++ {
+		if got, want := sliceBitLCP([]byte{0x99, 0x99}, []byte{0x99, 0xf9}, uint32(i)), uint32(9-i); got != want {
+			t.Fatalf("unexpected sliceBitLCP result: got %d, want %d", got, want)
+		}
+	}
+
+	for i := 0; i < 1; i++ {
+		if got, want := sliceBitLCP([]byte{0x99, 0x99}, []byte{0xd9, 0xf9}, uint32(i)), uint32(1-i); got != want {
+			t.Fatalf("unexpected sliceBitLCP result: got %d, want %d", got, want)
+		}
+	}
+	for i := 2; i < 8; i++ {
+		if got, want := sliceBitLCP([]byte{0x99, 0x99}, []byte{0xd9, 0xf9}, uint32(i)), uint32(9-i); got != want {
+			t.Fatalf("unexpected sliceBitLCP result: got %d, want %d", got, want)
+		}
+	}
+}