profiles/internal/rpc: more information when server hangs waiting on listeners
The most common reason why prod servers fail to shutdown cleanly is that
server.Stop() hangs. This cl makes the following changes to improve our chances
of diagnosing when this happens:
- reduce the timeout after which we print that the server is stuck waiting on
listeners from 5 mins to 5 secs. 5 minutes is a really long time, and we
rarely wait that long. It would be good to signal that we're stuck as soon as
it's likely that we are.
- to mitigate the false positive rate for the above change, print out something
if the listener wait does in fact complete
- add information about which server is actually stuck (in particular, the type
of the dispatcher is useful to distinguish servers in a multi-server daemon
like the mounttable (not to mention that the app cycle manager server is part
of any server run by device manager))
- remove some unused endpoints field that I found by chance.
Change-Id: I14539c057b9f52a6b9afbc3a18cc0fb455bbbfa5
diff --git a/profiles/internal/rpc/server.go b/profiles/internal/rpc/server.go
index d1eeae6..da73807 100644
--- a/profiles/internal/rpc/server.go
+++ b/profiles/internal/rpc/server.go
@@ -105,9 +105,6 @@
// state of proxies keyed by the name of the proxy
proxies map[string]proxyState
- // all endpoints generated and returned by this server
- endpoints []naming.Endpoint
-
disp rpc.Dispatcher // dispatcher to serve RPCs
dispReserved rpc.Dispatcher // dispatcher for reserved methods
active sync.WaitGroup // active goroutines we've spawned.
@@ -823,7 +820,10 @@
}
func (s *server) Stop() error {
+ serverDebug := fmt.Sprintf("Dispatcher: %T, Status:[%v]", s.disp, s.Status())
defer vlog.LogCall()()
+ vlog.VI(1).Infof("Stop: %s", serverDebug)
+ defer vlog.VI(1).Infof("Stop done: %s", serverDebug)
s.Lock()
if s.isStopState() {
s.Unlock()
@@ -904,16 +904,17 @@
select {
case <-done:
- case <-time.After(5 * time.Minute):
- vlog.Errorf("Listener Close Error: %v", firstErr)
- vlog.Errorf("Timedout waiting for goroutines to stop: listeners: %d (currently: %d)", nListeners, len(s.listeners))
+ case <-time.After(5 * time.Second):
+ vlog.Errorf("%s: Listener Close Error: %v", serverDebug, firstErr)
+ vlog.Errorf("%s: Timedout waiting for goroutines to stop: listeners: %d (currently: %d)", serverDebug, nListeners, len(s.listeners))
for ln, _ := range s.listeners {
- vlog.Errorf("Listener: %p", ln)
+ vlog.Errorf("%s: Listener: %p", serverDebug, ln)
}
for ls, _ := range s.listenState {
- vlog.Errorf("ListenState: %v", ls)
+ vlog.Errorf("%s: ListenState: %v", serverDebug, ls)
}
<-done
+ vlog.Infof("%s: Done waiting.", serverDebug)
}
s.Lock()