veyron/tools/playground: Add script to monitor GCE replicas.
Change-Id: If0bc6b97f1dfd6fa8e1e5f40f0641a0426751dd4
diff --git a/tools/playground/monitor.py b/tools/playground/monitor.py
new file mode 100755
index 0000000..300ef85
--- /dev/null
+++ b/tools/playground/monitor.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python2.7
+
+# This needs to run on a GCE VM with the replica pool
+# service account scope (https://www.googleapis.com/auth/ndev.cloudman).
+#
+# You also need to enable preview in gcloud:
+# $ gcloud components update preview
+#
+# Then add it to your crontab, e.g.
+# */10 * * * * gcloud preview replica-pools --zone us-central1-a replicas --pool playground-pool list|monitor.py
+
+import os
+import datetime
+import subprocess
+import sys
+import yaml
+
+# Presumably the steady-state number of replicas; restartReplica() resizes
+# the pool to DESIRED + 1 before deleting a replica.
+DESIRED = 2
+# A replica whose VM has been up longer than this many minutes is restarted
+# (compared in seconds inside isTooOld()).
+MAX_ALIVE_MIN = 60
+# Name of the replica pool passed to the gcloud commands below.
+POOL = "playground-pool"
+
+def runCommand(*args):
+ cmd = ['gcloud', 'preview', 'replica-pools', '--zone', 'us-central1-a']
+ cmd.extend(args)
+ subprocess.check_call(cmd)
+
+def resizePool(size):
+ runCommand("resize", "--new-size", str(size), POOL)
+
+
+def shouldRestart(replica):
+ if replica['status']['state'] == 'PERMANENTLY_FAILING':
+ print "replica %s failed: %s" % (replica['name'], replica['status']['details'])
+ return True
+ return isTooOld(replica)
+
+
+def isTooOld(replica):
+ start_text = replica['status']['vmStartTime']
+ if start_text:
+ start = yaml.load(start_text)
+ uptime = datetime.datetime.now() - start
+ return uptime.seconds > MAX_ALIVE_MIN * 60
+
+
+def restartReplica(replica):
+ print "Restarting replica " + replica['name']
+ resizePool(DESIRED + 1)
+ runCommand("replicas", "--pool", POOL, "delete", replica['name'])
+
+
+def maybeRestartReplica(replica):
+ if shouldRestart(replica):
+ restartReplica(replica)
+
+
+def main():
+ replicas = yaml.load_all(sys.stdin.read())
+ for replica in replicas:
+ maybeRestartReplica(replica)
+
+
+# Run the monitor only when executed as a script, not when imported.
+if __name__ == "__main__":
+ main()