| // Copyright 2015 The Vanadium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package main |
| |
| import ( |
| "fmt" |
| "time" |
| |
| cloudmonitoring "google.golang.org/api/monitoring/v3" |
| |
| "v.io/jiri/tool" |
| "v.io/v23/context" |
| "v.io/v23/options" |
| "v.io/v23/rpc/reserved" |
| "v.io/v23/verror" |
| "v.io/x/devtools/internal/monitoring" |
| "v.io/x/devtools/internal/test" |
| ) |
| |
| var ( |
| // Empirically, running "debug stats read -json |
| // /ns.dev.v.io:8101/binaries/__debug/stats/rpc/server/routing-id/*/methods/*/latency-ms/delta1m |
| // ten times took a max of 14 seconds with a standard deviation of 2.6 |
| // seconds. So we take max + 2 x stdev =~ 20 seconds. |
| timeout = 20 * time.Second |
| ) |
| |
| type latencyData struct { |
| location *monitoring.ServiceLocation |
| latency time.Duration |
| } |
| |
| // checkServiceLatency checks all services and adds their check latency to GCM. |
| func checkServiceLatency(v23ctx *context.T, ctx *tool.Context, s *cloudmonitoring.Service) error { |
| serviceNames := []string{ |
| monitoring.SNMounttable, |
| monitoring.SNMacaroon, |
| monitoring.SNGoogleIdentity, |
| monitoring.SNBinaryDischarger, |
| monitoring.SNRole, |
| monitoring.SNProxy, |
| } |
| |
| hasError := false |
| mdLat, err := monitoring.GetMetric("service-latency", projectFlag) |
| if err != nil { |
| return err |
| } |
| now := time.Now().UTC().Format(time.RFC3339) |
| for _, serviceName := range serviceNames { |
| lats, err := checkSingleServiceLatency(v23ctx, ctx, serviceName) |
| if err != nil { |
| test.Fail(ctx, "%s\n", serviceName) |
| fmt.Fprintf(ctx.Stderr(), "%v\n", err) |
| hasError = true |
| continue |
| } |
| agg := newAggregator() |
| for _, lat := range lats { |
| instance := lat.location.Instance |
| zone := lat.location.Zone |
| latMs := float64(lat.latency.Nanoseconds()) / 1000000.0 |
| agg.add(latMs) |
| |
| // Send data to GCM. |
| if err := sendDataToGCM(s, mdLat, latMs, now, instance, zone, serviceName); err != nil { |
| return err |
| } |
| |
| label := fmt.Sprintf("%s (%s, %s)", serviceName, instance, zone) |
| if lat.latency == timeout { |
| test.Warn(ctx, "%s: %fms [TIMEOUT]\n", label, latMs) |
| } else { |
| test.Pass(ctx, "%s: %fms\n", label, latMs) |
| } |
| } |
| |
| // Send aggregated data to GCM. |
| mdAgg, err := monitoring.GetMetric("service-latency-agg", projectFlag) |
| if err != nil { |
| return err |
| } |
| if err := sendAggregatedDataToGCM(ctx, s, mdAgg, agg, now, serviceName); err != nil { |
| return err |
| } |
| } |
| if hasError { |
| return fmt.Errorf("Failed to check some services.") |
| } |
| return nil |
| } |
| |
| func checkSingleServiceLatency(v23ctx *context.T, ctx *tool.Context, serviceName string) ([]latencyData, error) { |
| // Get service's mounted name. |
| serviceMountedName, err := monitoring.GetServiceMountedName(namespaceRootFlag, serviceName) |
| if err != nil { |
| return nil, err |
| } |
| // For proxy, we send "signature" RPC to "proxy-mon/__debug" endpoint. |
| if serviceName == monitoring.SNProxy { |
| serviceMountedName = fmt.Sprintf("%s/__debug", serviceMountedName) |
| } |
| // Resolve name and group results by routing ids. |
| groups, err := monitoring.ResolveAndProcessServiceName(v23ctx, ctx, serviceName, serviceMountedName) |
| if err != nil { |
| return nil, err |
| } |
| |
| // For each group, get the latency from the first available name. |
| latencies := []latencyData{} |
| for _, group := range groups { |
| latency := timeout |
| v23ctx, cancel := context.WithTimeout(v23ctx, timeout) |
| defer cancel() |
| start := time.Now() |
| if _, err := reserved.Signature(v23ctx, "", options.Preresolved{&group}); err != nil { |
| if verror.ErrorID(err) != verror.ErrTimeout.ID { |
| // Fail immediately on non-timeout errors. |
| return nil, err |
| } |
| } else { |
| latency = time.Now().Sub(start) |
| } |
| location, err := monitoring.GetServiceLocation(v23ctx, ctx, group) |
| if err != nil { |
| return nil, err |
| } |
| latencies = append(latencies, latencyData{ |
| location: location, |
| latency: latency, |
| }) |
| } |
| |
| return latencies, nil |
| } |