blob: e88f0f3b1d077fe9f1193778322a76e476d74f3f [file] [log] [blame]
// Copyright 2015 The Vanadium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package lock provides a lock object to synchronize access to a directory
// among multiple processes.
//
// The implementation relies on lock files written inside the locked directory.
//
// To safely handle situations where a process holding the lock exits without
// releasing the lock, the implementation employs a multi-level locking
// strategy: lock level 0 is considered the master lock. If the master lock is
// found to be stale (the owning process is dead), the process seeking the lock
// will need to grab lock level 1 before it can re-claim lock level 0 (only one
// process can do so, ensuring only one process proceeds to own lock level 0).
//
// If in turn lock level 1 is found to be stale, the process will need to grab
// lock level 2, and so on. Eventually, the process seeking the lock will reach
// a level N where the lock is not stale: if the level N lock is available, the
// process grabs it, reclaims lock level 0 for itself (after checking it's
// allowed to), and removes all locks for levels > 0.
//
// If the level N lock is held by another process, the process seeking the lock
// will retry starting at level 0.
//
// Lock level 0 determines who holds the lock once TryLock returns. Higher
// level locks are only held while TryLock is running, and are ephemeral. This
// scheme ensures that exactly one process ever grabs the lock regardless of
// whether it's stale.
package lock
import (
"bytes"
"fmt"
"io/ioutil"
"math/rand"
"os"
"path/filepath"
"strconv"
"sync"
"time"
"v.io/x/ref/lib/timekeeper"
"v.io/x/ref/services/agent/internal/lockutil"
)
// TryLocker is a sync.Locker augmented with TryLock.
type TryLocker interface {
sync.Locker
// TryLock attempts to grab the lock, but does not hang if the lock is
// actively held by another process. Instead, it returns false.
TryLock() bool
}
// TryLockerSafe is like TryLocker, but the methods can return an error and
// never panic.
type TryLockerSafe interface {
// TryLock attempts to grab the lock, but does not hang if the lock is
// actively held by another process. Instead, it returns false.
TryLock() (bool, error)
// Lock blocks until it's able to grab the lock.
Lock() error
// Unlock releases the lock. Should only be called when the lock is
// held.
Unlock() error
// Must returns a TryLocker whose Lock, TryLock and Unlock methods
// panic rather than return errors.
Must() TryLocker
}
const (
lockFileName = "lock"
tempFileName = "templock"
maxTries = 50
maxIndex = 20
sleepTime = 100 * time.Millisecond
sleepJitter = 20 * time.Millisecond // Should be < sleepTime.
)
// NewDirLock creates a new TryLockerSafe that can be used to manipulate a file
// lock for a directory.
func NewDirLock(dir string) TryLockerSafe {
return newDirLock(dir, timekeeper.RealTime())
}
type dirLock struct {
// dir is the directory being locked.
dir string
// timeKeeper allows injection of controlled time for testing.
timeKeeper timekeeper.TimeKeeper
// tryLockHook allows injection of test logic at the end of tryLock.
tryLockHook func()
// randomizer is used to jitter sleep time. Guarded by randMutex.
randomizer *rand.Rand
// randMutex guards against concurrent access to randomizer.
randMutex sync.Mutex
}
func newDirLock(dir string, t timekeeper.TimeKeeper) *dirLock {
return &dirLock{
dir: dir,
timeKeeper: t,
randomizer: rand.New(rand.NewSource(time.Now().UnixNano())),
}
}
// lockFile returns the path to the lock file for level #index.
func (l *dirLock) lockFile(index int) string {
return filepath.Join(l.dir, lockFileName+strconv.Itoa(index))
}
// readLockInfo returns the lock process information for lock level #index.
func (l *dirLock) readLockInfo(index int) ([]byte, error) {
return ioutil.ReadFile(l.lockFile(index))
}
type lockOutcome interface {
grabbed() bool
}
type grabbedLock bool
func (gl grabbedLock) grabbed() bool {
return bool(gl)
}
type staleLock string
func (staleLock) grabbed() bool {
return false
}
type failedLock struct {
error
}
func (failedLock) grabbed() bool {
return false
}
// tryLock tries to grab lock level #index.
func (l *dirLock) tryLock(index int) lockOutcome {
if l.tryLockHook != nil {
defer l.tryLockHook()
}
tmpFile, err := lockutil.CreateLockFile(l.dir, tempFileName)
if err != nil {
return failedLock{err}
}
defer os.Remove(tmpFile)
if err = os.Link(tmpFile, l.lockFile(index)); err == nil {
// Successfully grabbed the lock.
return grabbedLock(true)
}
if linkErr, ok := err.(*os.LinkError); !ok || !os.IsExist(linkErr.Err) {
// Some unexpected error occured.
return failedLock{err}
}
// Lock is either actively held, or stale. Figure out which.
lockInfo, err := l.readLockInfo(index)
if err != nil {
if os.IsNotExist(err) {
// Lock was just released. Count that as already held.
return grabbedLock(false)
}
return failedLock{err}
}
if running, err := lockutil.StillHeld(lockInfo); err != nil {
return failedLock{err}
} else if running {
// Lock is held by someone else.
return grabbedLock(false)
}
// Lock appears to be stale.
return staleLock(lockInfo)
}
// poachLock grabs lock level #index forcefully. Meant to be called when we're
// entitled to own the lock.
func (l *dirLock) poachLock(index int) error {
tmpFile, err := lockutil.CreateLockFile(l.dir, tempFileName)
if err != nil {
return nil
}
defer os.Remove(tmpFile)
return os.Rename(tmpFile, l.lockFile(index))
}
// unlock removes lock level #index. Meant to be called when we own the lock.
func (l *dirLock) unlock(index int) error {
return os.Remove(l.lockFile(index))
}
func (l *dirLock) sleep() {
// randomize sleep time to avoid sleep/wake cycle alignment among
// processes contending for the same lock.
l.randMutex.Lock()
sleepDuration := sleepTime + time.Duration(l.randomizer.Int63n(int64(sleepJitter)))
l.randMutex.Unlock()
l.timeKeeper.Sleep(sleepDuration)
}
// TryLock implements TryLockerSafe.TryLock.
func (l *dirLock) TryLock() (bool, error) {
retry:
for tries := 0; tries < maxTries; tries++ {
var masterLockInfo []byte
// Lock level #0 is the master lock.
switch t := l.tryLock(0).(type) {
default:
return false, fmt.Errorf("unexpected type: %T", t)
case failedLock:
return false, t.error
case grabbedLock:
// Either we grabbed the master lock, or it's being
// legitimately held by another process.
return bool(t), nil
case staleLock:
// Master lock appears to be stale. Proceed to grab the
// next level lock.
masterLockInfo = []byte(t)
}
// Go up the levels, trying to find the first non-stale lock.
for index := 1; index < maxIndex; index++ {
switch t := l.tryLock(index).(type) {
default:
return false, fmt.Errorf("unexpected type: %T", t)
case failedLock:
return false, t.error
case grabbedLock:
if !bool(t) {
// The lock is being legitimately held
// by some other process. Retry from
// level 0 after some delay.
l.sleep()
continue retry
}
// We grabbed lock level #index. Before taking
// over the master lock, we need to make sure
// the master lock has not changed since we
// examined it (this ensures that it's safe for
// the legitimate owner of the master lock to
// remove all higher level locks).
currMasterLockInfo, err := l.readLockInfo(0)
if err != nil && !os.IsNotExist(err) {
l.unlock(index)
return false, err
}
if err != nil || !bytes.Equal(currMasterLockInfo, masterLockInfo) {
// The master lock has changed. We are
// not the rightful owner. Retry from
// level 0 after some delay.
l.unlock(index)
l.sleep()
continue retry
}
case staleLock:
// Lock appears to be stale. Proceed to the
// next level lock.
continue
}
// We grabbed the lock and are the rightful owner.
// Steal the master lock and remove higher level locks.
if err := l.poachLock(0); err != nil {
l.unlock(index)
return false, err
}
// It's now safe to clear higher level locks since any
// other processes waiting to grab higher level locks
// will have to resume from level 0 once they notice
// that the master lock info has changed.
for i := index; i > 0; i-- {
l.unlock(i)
}
return true, nil
}
return false, fmt.Errorf("max number of lock levels exceeded: %d", maxIndex)
}
return false, fmt.Errorf("max number of tries exceeded: %d", maxTries)
}
// Lock implements TryLockerSafe.Lock.
func (l *dirLock) Lock() error {
for {
if locked, err := l.TryLock(); err != nil {
return err
} else if locked {
return nil
}
// Lock is held by another (live) process. Retry after an
// appropriate delay.
l.sleep()
}
}
// Unlock implements TryLockerSafe.Unlock.
func (l *dirLock) Unlock() error {
return l.unlock(0)
}
type mustLock struct {
l TryLockerSafe
}
// Lock implements sync.Locker.Lock.
func (m *mustLock) Lock() {
if err := m.l.Lock(); err != nil {
panic(err)
}
}
// TryLock implements TryLocker.TryLock.
func (m *mustLock) TryLock() bool {
got, err := m.l.TryLock()
if err != nil {
panic(err)
}
return got
}
// Unlock implements sync.Locker.Unlock.
func (m *mustLock) Unlock() {
if err := m.l.Unlock(); err != nil {
panic(err)
}
}
// Must implements TryLockerSafe.Must.
func (l *dirLock) Must() TryLocker {
return &mustLock{l}
}