// Copyright 2015 The Vanadium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package main

import (
	"fmt"
	"time"

	cloudmonitoring "google.golang.org/api/monitoring/v3"

	"v.io/jiri/tool"
	"v.io/v23/context"
	"v.io/v23/options"
	"v.io/v23/rpc/reserved"
	"v.io/v23/verror"
	"v.io/x/devtools/internal/monitoring"
	"v.io/x/devtools/internal/test"
)

var (
	// Empirically, running "debug stats read -json
	// /ns.dev.v.io:8101/binaries/__debug/stats/rpc/server/routing-id/*/methods/*/latency-ms/delta1m
	// ten times took a max of 14 seconds with a standard deviation of 2.6
	// seconds. So we take max + 2 x stdev =~ 20 seconds.
	timeout = 20 * time.Second
)
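
// latencyData records the latency measured for a single service instance,
// together with the instance and zone where that instance runs.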
type latencyData struct {
	location *monitoring.ServiceLocation
	latency  time.Duration
}

// checkServiceLatency checks all services and adds their check latency to GCM.
func checkServiceLatency(v23ctx *context.T, ctx *tool.Context, s *cloudmonitoring.Service) error {
	serviceNames := []string{
		monitoring.SNMounttable,
		monitoring.SNMacaroon,
		monitoring.SNGoogleIdentity,
		monitoring.SNBinaryDischarger,
		monitoring.SNRole,
		monitoring.SNProxy,
	}
	hasError := false
	mdLat, err := monitoring.GetMetric("service-latency", projectFlag)
	if err != nil {
		return err
	}
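	// All data points written to GCM in this run share a single RFC3339 UTC timestamp.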
	now := time.Now().UTC().Format(time.RFC3339)
	for _, serviceName := range serviceNames {
		lats, err := checkSingleServiceLatency(v23ctx, ctx, serviceName)
		if err != nil {
			test.Fail(ctx, "%s\n", serviceName)
			fmt.Fprintf(ctx.Stderr(), "%v\n", err)
			hasError = true
			continue
		}
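		// Aggregate the per-instance latencies so a summary can also be sent to GCM.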
		agg := newAggregator()
		for _, lat := range lats {
			instance := lat.location.Instance
			zone := lat.location.Zone
			latMs := float64(lat.latency.Nanoseconds()) / 1000000.0
			agg.add(latMs)

			// Send data to GCM.
			if err := sendDataToGCM(s, mdLat, latMs, now, instance, zone, serviceName); err != nil {
				return err
			}

			label := fmt.Sprintf("%s (%s, %s)", serviceName, instance, zone)
			if lat.latency == timeout {
				test.Warn(ctx, "%s: %fms [TIMEOUT]\n", label, latMs)
			} else {
				test.Pass(ctx, "%s: %fms\n", label, latMs)
			}
		}

		// Send aggregated data to GCM.
		mdAgg, err := monitoring.GetMetric("service-latency-agg", projectFlag)
		if err != nil {
			return err
		}
		if err := sendAggregatedDataToGCM(ctx, s, mdAgg, agg, now, serviceName); err != nil {
			return err
		}
	}
	if hasError {
		return fmt.Errorf("failed to check some services")
	}
	return nil
}
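
// checkSingleServiceLatency resolves the given service name and, for each
// routing-id group in the result, measures how long a "signature" RPC to it
// takes, up to the timeout defined above.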
func checkSingleServiceLatency(v23ctx *context.T, ctx *tool.Context, serviceName string) ([]latencyData, error) {
	// Get the service's mounted name.
	serviceMountedName, err := monitoring.GetServiceMountedName(namespaceRootFlag, serviceName)
	if err != nil {
		return nil, err
	}
	// For the proxy, we send the "signature" RPC to its "proxy-mon/__debug" endpoint.
	if serviceName == monitoring.SNProxy {
		serviceMountedName = fmt.Sprintf("%s/__debug", serviceMountedName)
	}

	// Resolve the name and group the results by routing ids.
	groups, err := monitoring.ResolveAndProcessServiceName(v23ctx, ctx, serviceName, serviceMountedName)
	if err != nil {
		return nil, err
	}

	// For each group, get the latency from the first available name.
	latencies := []latencyData{}
	for _, group := range groups {
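		// Default to the timeout so that a timed-out check is recorded as the
		// maximum latency instead of being dropped.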
		latency := timeout
		v23ctx, cancel := context.WithTimeout(v23ctx, timeout)
		defer cancel()
		start := time.Now()
		if _, err := reserved.Signature(v23ctx, "", options.Preresolved{&group}); err != nil {
			if verror.ErrorID(err) != verror.ErrTimeout.ID {
				// Fail immediately on non-timeout errors.
				return nil, err
			}
		} else {
			latency = time.Since(start)
		}
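		// Look up the instance and zone this routing-id group is served from,
		// so the data point can be labeled with its location.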
		location, err := monitoring.GetServiceLocation(v23ctx, ctx, group)
		if err != nil {
			return nil, err
		}
		latencies = append(latencies, latencyData{
			location: location,
			latency:  latency,
		})
	}
	return latencies, nil
}