Merge pull request #5971 from sreecha/stress_test_flexible_config

Stress test flexible config
author: Jan Tattermusch <jtattermusch@users.noreply.github.com> 2016-03-29 09:05:43 -0700
committer: Jan Tattermusch <jtattermusch@users.noreply.github.com> 2016-03-29 09:05:43 -0700
commit: 46962bb5f88fa84cecb6ef1246f3307a4a13f04f (patch)
tree: 1342a5cd2d7c3b524fb55dfda685d3f85fc277dc /tools
parent: d2eb23974d62e7d6f2ed6486ba559e36f16b8081 (diff)
parent: e68ec434804949209329c5953b65ae31e920a620 (diff)
8 files changed, 916 insertions, 571 deletions
diff --git a/tools/dockerfile/grpc_interop_stress_cxx/Dockerfile b/tools/dockerfile/grpc_interop_stress_cxx/Dockerfile
index 4123cc1a26..214747fd4a 100644
--- a/tools/dockerfile/grpc_interop_stress_cxx/Dockerfile
+++ b/tools/dockerfile/grpc_interop_stress_cxx/Dockerfile
@@ -27,12 +27,9 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-# A work-in-progress Dockerfile that allows running gRPC test suites
-# inside a docker container.
-
 FROM debian:jessie
 
-# Install Git.
+# Install Git and basic packages.
 RUN apt-get update && apt-get install -y \
   autoconf \
   autotools-dev \
@@ -43,13 +40,16 @@ RUN apt-get update && apt-get install -y \
   gcc \
   gcc-multilib \
   git \
+  golang \
   gyp \
+  lcov \
   libc6 \
   libc6-dbg \
   libc6-dev \
   libgtest-dev \
   libtool \
   make \
+  perl \
   strace \
   python-dev \
   python-setuptools \
@@ -59,7 +59,9 @@ RUN apt-get update && apt-get install -y \
   wget \
   zip && apt-get clean
 
-RUN easy_install -U pip
+#================
+# Build profiling
+RUN apt-get update && apt-get install -y time && apt-get clean
 
 # Prepare ccache
 RUN ln -s /usr/bin/ccache /usr/local/bin/gcc
@@ -69,12 +71,47 @@ RUN ln -s /usr/bin/ccache /usr/local/bin/c++
 RUN ln -s /usr/bin/ccache /usr/local/bin/clang
 RUN ln -s /usr/bin/ccache /usr/local/bin/clang++
 
-##################
+#=================
 # C++ dependencies
-RUN apt-get update && apt-get -y install libgflags-dev libgtest-dev libc++-dev clang
+RUN apt-get update && apt-get -y install libgflags-dev libgtest-dev libc++-dev clang && apt-get clean
 
-# Google Cloud platform API libraries (for BigQuery)
+# Google Cloud platform API libraries
+RUN apt-get update && apt-get install -y python-pip && apt-get clean
 RUN pip install --upgrade google-api-python-client
 
+
+#=================
+# Update clang to a version with improved tsan
+
+RUN apt-get update && apt-get -y install python cmake && apt-get clean
+
+RUN git clone -n -b release_38 http://llvm.org/git/llvm.git && \
+  cd llvm && git checkout ad57503 && cd ..
+RUN git clone -n -b release_38 http://llvm.org/git/clang.git && \
+  cd clang && git checkout ad2c56e && cd ..
+RUN git clone -n -b release_38 http://llvm.org/git/compiler-rt.git && \
+  cd compiler-rt && git checkout 3176922 && cd ..
+RUN git clone -n -b release_38 \
+  http://llvm.org/git/clang-tools-extra.git && cd clang-tools-extra && \
+  git checkout c288525 && cd ..
+RUN git clone -n -b release_38 http://llvm.org/git/libcxx.git && \
+  cd libcxx && git checkout fda3549  && cd ..
+RUN git clone -n -b release_38 http://llvm.org/git/libcxxabi.git && \
+  cd libcxxabi && git checkout 8d4e51d && cd ..
+
+RUN mv clang llvm/tools
+RUN mv compiler-rt llvm/projects
+RUN mv clang-tools-extra llvm/tools/clang/tools
+RUN mv libcxx llvm/projects
+RUN mv libcxxabi llvm/projects
+
+RUN mkdir llvm-build
+RUN cd llvm-build && cmake \
+  -DCMAKE_BUILD_TYPE:STRING=Release \
+  -DCMAKE_INSTALL_PREFIX:STRING=/usr \
+  -DLLVM_TARGETS_TO_BUILD:STRING=X86 \
+  ../llvm
+RUN make -C llvm-build && make -C llvm-build install && rm -rf llvm-build
+
 # Define the default command.
 CMD ["bash"]
diff --git a/tools/dockerfile/grpc_interop_stress_cxx/build_interop_stress.sh b/tools/dockerfile/grpc_interop_stress_cxx/build_interop_stress.sh
index 392bdfccda..470db4c13f 100755
--- a/tools/dockerfile/grpc_interop_stress_cxx/build_interop_stress.sh
+++ b/tools/dockerfile/grpc_interop_stress_cxx/build_interop_stress.sh
@@ -41,5 +41,7 @@ cd /var/local/git/grpc
 
 make install-certs
 
+BUILD_TYPE=${BUILD_TYPE:=opt}
+
 # build C++ interop stress client, interop client and server
-make stress_test metrics_client interop_client interop_server
+make CONFIG=$BUILD_TYPE stress_test metrics_client interop_client interop_server
diff --git a/tools/jenkins/build_interop_stress_image.sh b/tools/jenkins/build_interop_stress_image.sh
index 501dc5b7ca..b5dbcc5ce4 100755
--- a/tools/jenkins/build_interop_stress_image.sh
+++ b/tools/jenkins/build_interop_stress_image.sh
@@ -34,10 +34,12 @@
 set -x
 
 # Params:
-#  INTEROP_IMAGE - name of tag of the final interop image
+#  INTEROP_IMAGE - Name of tag of the final interop image
 #  INTEROP_IMAGE_TAG - Optional. If set, the created image will be tagged using
 #    the command: 'docker tag $INTEROP_IMAGE $INTEROP_IMAGE_REPOSITORY_TAG'
-#  BASE_NAME - base name used to locate the base Dockerfile and build script
+#  BASE_NAME - Base name used to locate the base Dockerfile and build script
+#  BUILD_TYPE - The 'CONFIG' variable passed to the 'make' command (example:
+#  asan, tsan. Default value: opt).
 #  TTY_FLAG - optional -t flag to make docker allocate tty
 #  BUILD_INTEROP_DOCKER_EXTRA_ARGS - optional args to be passed to the
 #    docker run command
@@ -71,6 +73,7 @@ CONTAINER_NAME="build_${BASE_NAME}_$(uuidgen)"
 (docker run \
   -e CCACHE_DIR=/tmp/ccache \
   -e THIS_IS_REALLY_NEEDED='see https://github.com/docker/docker/issues/14203 for why docker is awful' \
+  -e BUILD_TYPE=${BUILD_TYPE:=opt} \
   -i $TTY_FLAG \
   $MOUNT_ARGS \
   $BUILD_INTEROP_DOCKER_EXTRA_ARGS \
diff --git a/tools/run_tests/stress_test/README.md b/tools/run_tests/stress_test/README.md
index 1a48e90c69..84f9719cb1 100644
--- a/tools/run_tests/stress_test/README.md
+++ b/tools/run_tests/stress_test/README.md
@@ -67,8 +67,10 @@ The script has several parameters and you can find out more details by using the
   - `<grpc_root_dir>$ tools/run_tests/stress_test/run_stress_tests_on_gke.py --help`
 
 > **Example**
-> `$ tools/run_tests/stress_test/run_stress_tests_on_gke.py --project_id=sree-gce --test_duration_secs=180 --num_clients=5`
+> ```bash
+> $ # Change to the grpc root directory
+> $ cd $GRPC_ROOT
+> $ tools/run_tests/stress_test/run_on_gke.py --project_id=sree-gce --config_file=tools/run_tests/stress_test/configs/opt.json
+> ```
 
-> Launches the 5 instances of stress test clients, 1 instance of stress test server and runs the test for 180 seconds. The test would be run on the default container cluster (that you have set in `gcloud`) in the project `sree-gce`.
-
-> Note: we currently do not have the ability to launch multiple instances of the server. This can be added very easily in future
+> The above runs the stress test on GKE under the project `sree-gce` in the default cluster (that you set by `gcloud` command earlier). The test settings (like number of client instances, servers, the parmeters to pass, test cases etc) are all loaded from the config file `$GRPC_ROOT/tools/run_tests/stress_test/opt.json`
diff --git a/tools/run_tests/stress_test/configs/opt-tsan-asan.json b/tools/run_tests/stress_test/configs/opt-tsan-asan.json
new file mode 100644
index 0000000000..1dc2d3fe08
--- /dev/null
+++ b/tools/run_tests/stress_test/configs/opt-tsan-asan.json
@@ -0,0 +1,135 @@
+{
+  "dockerImages": {
+    "grpc_stress_cxx_opt" : {
+      "buildScript": "tools/jenkins/build_interop_stress_image.sh",
+      "dockerFileDir": "grpc_interop_stress_cxx",
+      "buildType": "opt"
+    },
+    "grpc_stress_cxx_tsan": {
+      "buildScript": "tools/jenkins/build_interop_stress_image.sh",
+      "dockerFileDir": "grpc_interop_stress_cxx",
+      "buildType": "tsan"
+    },
+    "grpc_stress_cxx_asan": {
+      "buildScript": "tools/jenkins/build_interop_stress_image.sh",
+      "dockerFileDir": "grpc_interop_stress_cxx",
+      "buildType": "asan"
+    }
+  },
+
+  "clientTemplates": {
+    "baseTemplates": {
+      "default": {
+        "wrapperScriptPath": "/var/local/git/grpc/tools/gcp/stress_test/run_client.py",
+        "pollIntervalSecs": 60,
+        "clientArgs": {
+          "num_channels_per_server":5,
+          "num_stubs_per_channel":10,
+          "test_cases": "empty_unary:1,large_unary:1,client_streaming:1,server_streaming:1,empty_stream:1",
+          "metrics_port": 8081,
+          "metrics_collection_interval_secs":60
+        },
+        "metricsPort": 8081,
+        "metricsArgs": {
+          "metrics_server_address": "localhost:8081",
+          "total_only": "true"
+        }
+      }
+    },
+    "templates": {
+      "cxx_client_opt": {
+        "baseTemplate": "default",
+        "clientImagePath": "/var/local/git/grpc/bins/opt/stress_test",
+        "metricsClientImagePath": "/var/local/git/grpc/bins/opt/metrics_client"
+      },
+      "cxx_client_tsan": {
+        "baseTemplate": "default",
+        "clientImagePath": "/var/local/git/grpc/bins/tsan/stress_test",
+        "metricsClientImagePath": "/var/local/git/grpc/bins/tsan/metrics_client"
+      },
+    "cxx_client_asan": {
+        "baseTemplate": "default",
+        "clientImagePath": "/var/local/git/grpc/bins/asan/stress_test",
+        "metricsClientImagePath": "/var/local/git/grpc/bins/asan/metrics_client"
+      }
+    }
+  },
+
+  "serverTemplates": {
+    "baseTemplates":{
+      "default": {
+        "wrapperScriptPath": "/var/local/git/grpc/tools/gcp/stress_test/run_server.py",
+        "serverPort": 8080,
+        "serverArgs": {
+          "port": 8080
+        }
+      }
+    },
+    "templates": {
+      "cxx_server_opt": {
+        "baseTemplate": "default",
+        "serverImagePath": "/var/local/git/grpc/bins/opt/interop_server"
+      },
+      "cxx_server_tsan": {
+        "baseTemplate": "default",
+        "serverImagePath": "/var/local/git/grpc/bins/tsan/interop_server"
+      },
+    "cxx_server_asan": {
+        "baseTemplate": "default",
+        "serverImagePath": "/var/local/git/grpc/bins/asan/interop_server"
+      }
+    }
+  },
+
+  "testMatrix": {
+    "serverPodSpecs": {
+      "stress-server-opt": {
+        "serverTemplate": "cxx_server_opt",
+        "dockerImage": "grpc_stress_cxx_opt",
+        "numInstances": 1
+      },
+      "stress-server-tsan": {
+        "serverTemplate": "cxx_server_tsan",
+        "dockerImage": "grpc_stress_cxx_tsan",
+        "numInstances": 1
+      },
+      "stress-server-asan": {
+        "serverTemplate": "cxx_server_asan",
+        "dockerImage": "grpc_stress_cxx_asan",
+        "numInstances": 1
+      }
+   },
+
+    "clientPodSpecs": {
+      "stress-client-opt": {
+        "clientTemplate": "cxx_client_opt",
+        "dockerImage": "grpc_stress_cxx_opt",
+        "numInstances": 3,
+        "serverPodSpec": "stress-server-opt"
+      },
+      "stress-client-tsan": {
+        "clientTemplate": "cxx_client_tsan",
+        "dockerImage": "grpc_stress_cxx_tsan",
+        "numInstances": 3,
+        "serverPodSpec": "stress-server-tsan"
+      },
+      "stress-client-asan": {
+        "clientTemplate": "cxx_client_asan",
+        "dockerImage": "grpc_stress_cxx_asan",
+        "numInstances": 3,
+        "serverPodSpec": "stress-server-asan"
+      }
+    }
+  },
+
+  "globalSettings": {
+    "buildDockerImages": true,
+    "pollIntervalSecs": 60,
+    "testDurationSecs": 7200,
+    "kubernetesProxyPort": 8001,
+    "datasetIdNamePrefix": "stress_test_opt_tsan",
+    "summaryTableId": "summary",
+    "qpsTableId": "qps",
+    "podWarmupSecs": 60
+  }
+}
diff --git a/tools/run_tests/stress_test/configs/opt.json b/tools/run_tests/stress_test/configs/opt.json
new file mode 100644
index 0000000000..7fc024034b
--- /dev/null
+++ b/tools/run_tests/stress_test/configs/opt.json
@@ -0,0 +1,86 @@
+{
+  "dockerImages": {
+    "grpc_stress_cxx_opt" : {
+      "buildScript": "tools/jenkins/build_interop_stress_image.sh",
+      "dockerFileDir": "grpc_interop_stress_cxx",
+      "buildType": "opt"
+    }
+  },
+
+  "clientTemplates": {
+    "baseTemplates": {
+      "default": {
+        "wrapperScriptPath": "/var/local/git/grpc/tools/gcp/stress_test/run_client.py",
+        "pollIntervalSecs": 60,
+        "clientArgs": {
+          "num_channels_per_server":5,
+          "num_stubs_per_channel":10,
+          "test_cases": "empty_unary:1,large_unary:1,client_streaming:1,server_streaming:1,empty_stream:1",
+          "metrics_port": 8081,
+          "metrics_collection_interval_secs":60
+        },
+        "metricsPort": 8081,
+        "metricsArgs": {
+          "metrics_server_address": "localhost:8081",
+          "total_only": "true"
+        }
+      }
+    },
+    "templates": {
+      "cxx_client_opt": {
+        "baseTemplate": "default",
+        "clientImagePath": "/var/local/git/grpc/bins/opt/stress_test",
+        "metricsClientImagePath": "/var/local/git/grpc/bins/opt/metrics_client"
+      }
+    }
+  },
+
+  "serverTemplates": {
+    "baseTemplates":{
+      "default": {
+        "wrapperScriptPath": "/var/local/git/grpc/tools/gcp/stress_test/run_server.py",
+        "serverPort": 8080,
+        "serverArgs": {
+          "port": 8080
+        }
+      }
+    },
+    "templates": {
+      "cxx_server_opt": {
+        "baseTemplate": "default",
+        "serverImagePath": "/var/local/git/grpc/bins/opt/interop_server"
+      }
+    }
+  },
+
+  "testMatrix": {
+    "serverPodSpecs": {
+      "stress-server-opt": {
+        "serverTemplate": "cxx_server_opt",
+        "dockerImage": "grpc_stress_cxx_opt",
+        "numInstances": 1
+      }
+    },
+
+    "clientPodSpecs": {
+      "stress-client-opt": {
+        "clientTemplate": "cxx_client_opt",
+        "dockerImage": "grpc_stress_cxx_opt",
+        "numInstances": 10,
+        "serverPodSpec": "stress-server-opt"
+      }
+    }
+  },
+
+  "globalSettings": {
+    "buildDockerImages": true,
+    "pollIntervalSecs": 10,
+    "testDurationSecs": 120,
+    "kubernetesProxyPort": 8001,
+    "datasetIdNamePrefix": "stress_test_opt",
+    "summaryTableId": "summary",
+    "qpsTableId": "qps",
+    "podWarmupSecs": 60
+  }
+}
+
diff --git a/tools/run_tests/stress_test/run_on_gke.py b/tools/run_tests/stress_test/run_on_gke.py
new file mode 100755
index 0000000000..3a81c1a376
--- /dev/null
+++ b/tools/run_tests/stress_test/run_on_gke.py
@@ -0,0 +1,636 @@
+#!/usr/bin/env python2.7
+# Copyright 2015-2016, Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import argparse
+import datetime
+import json
+import os
+import subprocess
+import sys
+import time
+
+stress_test_utils_dir = os.path.abspath(os.path.join(
+    os.path.dirname(__file__), '../../gcp/stress_test'))
+sys.path.append(stress_test_utils_dir)
+from stress_test_utils import BigQueryHelper
+
+kubernetes_api_dir = os.path.abspath(os.path.join(
+    os.path.dirname(__file__), '../../gcp/utils'))
+sys.path.append(kubernetes_api_dir)
+
+import kubernetes_api
+
+
+class GlobalSettings:
+
+  def __init__(self, gcp_project_id, build_docker_images,
+               test_poll_interval_secs, test_duration_secs,
+               kubernetes_proxy_port, dataset_id_prefix, summary_table_id,
+               qps_table_id, pod_warmup_secs):
+    self.gcp_project_id = gcp_project_id
+    self.build_docker_images = build_docker_images
+    self.test_poll_interval_secs = test_poll_interval_secs
+    self.test_duration_secs = test_duration_secs
+    self.kubernetes_proxy_port = kubernetes_proxy_port
+    self.dataset_id_prefix = dataset_id_prefix
+    self.summary_table_id = summary_table_id
+    self.qps_table_id = qps_table_id
+    self.pod_warmup_secs = pod_warmup_secs
+
+
+class ClientTemplate:
+  """ Contains all the common settings that are used by a stress client """
+
+  def __init__(self, name, client_image_path, metrics_client_image_path,
+               metrics_port, wrapper_script_path, poll_interval_secs,
+               client_args_dict, metrics_args_dict):
+    self.name = name
+    self.client_image_path = client_image_path
+    self.metrics_client_image_path = metrics_client_image_path
+    self.metrics_port = metrics_port
+    self.wrapper_script_path = wrapper_script_path
+    self.poll_interval_secs = poll_interval_secs
+    self.client_args_dict = client_args_dict
+    self.metrics_args_dict = metrics_args_dict
+
+
+class ServerTemplate:
+  """ Contains all the common settings used by a stress server """
+
+  def __init__(self, name, server_image_path, wrapper_script_path, server_port,
+               server_args_dict):
+    self.name = name
+    self.server_image_path = server_image_path
+    self.wrapper_script_path = wrapper_script_path
+    self.server_port = server_port
+    self.server_args_dict = server_args_dict
+
+
+class DockerImage:
+  """ Represents properties of a Docker image. Provides methods to build the
+  image and push it to GKE registry
+  """
+
+  def __init__(self, gcp_project_id, image_name, build_script_path,
+               dockerfile_dir, build_type):
+    """Args:
+
+      image_name: The docker image name
+      tag_name: The additional tag name. This is the name used when pushing the
+        docker image to GKE registry
+      build_script_path: The path to the build script that builds this docker
+      image
+      dockerfile_dir: The name of the directory under
+      '<grpc_root>/tools/dockerfile' that contains the dockerfile
+    """
+    self.image_name = image_name
+    self.gcp_project_id = gcp_project_id
+    self.build_script_path = build_script_path
+    self.dockerfile_dir = dockerfile_dir
+    self.build_type = build_type
+    self.tag_name = self._make_tag_name(gcp_project_id, image_name)
+
+  def _make_tag_name(self, project_id, image_name):
+    return 'gcr.io/%s/%s' % (project_id, image_name)
+
+  def build_image(self):
+    print 'Building docker image: %s (tag: %s)' % (self.image_name,
+                                                   self.tag_name)
+    os.environ['INTEROP_IMAGE'] = self.image_name
+    os.environ['INTEROP_IMAGE_REPOSITORY_TAG'] = self.tag_name
+    os.environ['BASE_NAME'] = self.dockerfile_dir
+    os.environ['BUILD_TYPE'] = self.build_type
+    print 'DEBUG: path: ', self.build_script_path
+    if subprocess.call(args=[self.build_script_path]) != 0:
+      print 'Error in building the Docker image'
+      return False
+    return True
+
+  def push_to_gke_registry(self):
+    cmd = ['gcloud', 'docker', 'push', self.tag_name]
+    print 'Pushing %s to the GKE registry..' % self.tag_name
+    if subprocess.call(args=cmd) != 0:
+      print 'Error in pushing the image %s to the GKE registry' % self.tag_name
+      return False
+    return True
+
+
+class ServerPodSpec:
+  """ Contains the information required to launch server pods. """
+
+  def __init__(self, name, server_template, docker_image, num_instances):
+    self.name = name
+    self.template = server_template
+    self.docker_image = docker_image
+    self.num_instances = num_instances
+
+  def pod_names(self):
+    """ Return a list of names of server pods to create. """
+    return ['%s-%d' % (self.name, i) for i in range(1, self.num_instances + 1)]
+
+  def server_addresses(self):
+    """ Return string of server addresses in the following format:
+      '<server_pod_name_1>:<server_port>,<server_pod_name_2>:<server_port>...'
+    """
+    return ','.join(['%s:%d' % (pod_name, self.template.server_port)
+                     for pod_name in self.pod_names()])
+
+
+class ClientPodSpec:
+  """ Contains the information required to launch client pods """
+
+  def __init__(self, name, client_template, docker_image, num_instances,
+               server_addresses):
+    self.name = name
+    self.template = client_template
+    self.docker_image = docker_image
+    self.num_instances = num_instances
+    self.server_addresses = server_addresses
+
+  def pod_names(self):
+    """ Return a list of names of client pods to create """
+    return ['%s-%d' % (self.name, i) for i in range(1, self.num_instances + 1)]
+
+  # The client args in the template do not have server addresses. This function
+  # adds the server addresses and returns the updated client args
+  def get_client_args_dict(self):
+    args_dict = self.template.client_args_dict.copy()
+    args_dict['server_addresses'] = self.server_addresses
+    return args_dict
+
+
+class Gke:
+  """ Class that has helper methods to interact with GKE """
+
+  class KubernetesProxy:
+    """Class to start a proxy on localhost to talk to the Kubernetes API server"""
+
+    def __init__(self, port):
+      cmd = ['kubectl', 'proxy', '--port=%d' % port]
+      self.p = subprocess.Popen(args=cmd)
+      time.sleep(2)
+      print '\nStarted kubernetes proxy on port: %d' % port
+
+    def __del__(self):
+      if self.p is not None:
+        print 'Shutting down Kubernetes proxy..'
+        self.p.kill()
+
+  def __init__(self, project_id, run_id, dataset_id, summary_table_id,
+               qps_table_id, kubernetes_port):
+    self.project_id = project_id
+    self.run_id = run_id
+    self.dataset_id = dataset_id
+    self.summary_table_id = summary_table_id
+    self.qps_table_id = qps_table_id
+
+    # The environment variables we would like to pass to every pod (both client
+    # and server) launched in GKE
+    self.gke_env = {
+        'RUN_ID': self.run_id,
+        'GCP_PROJECT_ID': self.project_id,
+        'DATASET_ID': self.dataset_id,
+        'SUMMARY_TABLE_ID': self.summary_table_id,
+        'QPS_TABLE_ID': self.qps_table_id
+    }
+
+    self.kubernetes_port = kubernetes_port
+    # Start kubernetes proxy
+    self.kubernetes_proxy = Gke.KubernetesProxy(kubernetes_port)
+
+  def _args_dict_to_str(self, args_dict):
+    return ' '.join('--%s=%s' % (k, args_dict[k]) for k in args_dict.keys())
+
+  def launch_servers(self, server_pod_spec):
+    is_success = True
+
+    # The command to run inside the container is the wrapper script (which then
+    # launches the actual server)
+    container_cmd = server_pod_spec.template.wrapper_script_path
+
+    # The parameters to the wrapper script (defined in
+    # server_pod_spec.template.wrapper_script_path) are are injected into the
+    # container via environment variables
+    server_env = self.gke_env.copy()
+    server_env.update({
+        'STRESS_TEST_IMAGE_TYPE': 'SERVER',
+        'STRESS_TEST_IMAGE': server_pod_spec.template.server_image_path,
+        'STRESS_TEST_ARGS_STR': self._args_dict_to_str(
+            server_pod_spec.template.server_args_dict)
+    })
+
+    for pod_name in server_pod_spec.pod_names():
+      server_env['POD_NAME'] = pod_name
+      print 'Creating server: %s' % pod_name
+      is_success = kubernetes_api.create_pod_and_service(
+          'localhost',
+          self.kubernetes_port,
+          'default',  # Use 'default' namespace
+          pod_name,
+          server_pod_spec.docker_image.tag_name,
+          [server_pod_spec.template.server_port],  # Ports to expose on the pod
+          [container_cmd],
+          [],  # Args list is empty since we are passing all args via env variables
+          server_env,
+          True  # Headless = True for server to that GKE creates a DNS record for pod_name
+      )
+      if not is_success:
+        print 'Error in launching server: %s' % pod_name
+        break
+
+    if is_success:
+      print 'Successfully created server(s)'
+
+    return is_success
+
+  def launch_clients(self, client_pod_spec):
+    is_success = True
+
+    # The command to run inside the container is the wrapper script (which then
+    # launches the actual stress client)
+    container_cmd = client_pod_spec.template.wrapper_script_path
+
+    # The parameters to the wrapper script (defined in
+    # client_pod_spec.template.wrapper_script_path) are are injected into the
+    # container via environment variables
+    client_env = self.gke_env.copy()
+    client_env.update({
+        'STRESS_TEST_IMAGE_TYPE': 'CLIENT',
+        'STRESS_TEST_IMAGE': client_pod_spec.template.client_image_path,
+        'STRESS_TEST_ARGS_STR': self._args_dict_to_str(
+            client_pod_spec.get_client_args_dict()),
+        'METRICS_CLIENT_IMAGE':
+            client_pod_spec.template.metrics_client_image_path,
+        'METRICS_CLIENT_ARGS_STR': self._args_dict_to_str(
+            client_pod_spec.template.metrics_args_dict),
+        'POLL_INTERVAL_SECS': str(client_pod_spec.template.poll_interval_secs)
+    })
+
+    for pod_name in client_pod_spec.pod_names():
+      client_env['POD_NAME'] = pod_name
+      print 'Creating client: %s' % pod_name
+      is_success = kubernetes_api.create_pod_and_service(
+          'localhost',
+          self.kubernetes_port,
+          'default',  # default namespace,
+          pod_name,
+          client_pod_spec.docker_image.tag_name,
+          [client_pod_spec.template.metrics_port],  # Ports to expose on the pod
+          [container_cmd],
+          [],  # Empty args list since all args are passed via env variables
+          client_env,
+          False  # Client is not a headless service.
+      )
+
+      if not is_success:
+        print 'Error in launching client %s' % pod_name
+        break
+
+    if is_success:
+      print 'Successfully created all client(s)'
+
+    return is_success
+
+  def _delete_pods(self, pod_name_list):
+    is_success = True
+    for pod_name in pod_name_list:
+      print 'Deleting %s' % pod_name
+      is_success = kubernetes_api.delete_pod_and_service(
+          'localhost',
+          self.kubernetes_port,
+          'default',  # default namespace
+          pod_name)
+
+      if not is_success:
+        print 'Error in deleting pod %s' % pod_name
+        break
+
+    if is_success:
+      print 'Successfully deleted all pods'
+
+    return is_success
+
+  def delete_servers(self, server_pod_spec):
+    return self._delete_pods(server_pod_spec.pod_names())
+
+  def delete_clients(self, client_pod_spec):
+    return self._delete_pods(client_pod_spec.pod_names())
+
+
+class Config:
+
+  def __init__(self, config_filename, gcp_project_id):
+    print 'Loading configuration...'
+    config_dict = self._load_config(config_filename)
+
+    self.global_settings = self._parse_global_settings(config_dict,
+                                                       gcp_project_id)
+    self.docker_images_dict = self._parse_docker_images(
+        config_dict, self.global_settings.gcp_project_id)
+    self.client_templates_dict = self._parse_client_templates(config_dict)
+    self.server_templates_dict = self._parse_server_templates(config_dict)
+    self.server_pod_specs_dict = self._parse_server_pod_specs(
+        config_dict, self.docker_images_dict, self.server_templates_dict)
+    self.client_pod_specs_dict = self._parse_client_pod_specs(
+        config_dict, self.docker_images_dict, self.client_templates_dict,
+        self.server_pod_specs_dict)
+    print 'Loaded Configuaration.'
+
+  def _parse_global_settings(self, config_dict, gcp_project_id):
+    global_settings_dict = config_dict['globalSettings']
+    return GlobalSettings(gcp_project_id,
+                          global_settings_dict['buildDockerImages'],
+                          global_settings_dict['pollIntervalSecs'],
+                          global_settings_dict['testDurationSecs'],
+                          global_settings_dict['kubernetesProxyPort'],
+                          global_settings_dict['datasetIdNamePrefix'],
+                          global_settings_dict['summaryTableId'],
+                          global_settings_dict['qpsTableId'],
+                          global_settings_dict['podWarmupSecs'])
+
+  def _parse_docker_images(self, config_dict, gcp_project_id):
+    """Parses the 'dockerImages' section of the config file and returns a
+    Dictionary of 'DockerImage' objects keyed by docker image names"""
+    docker_images_dict = {}
+
+    docker_config_dict = config_dict['dockerImages']
+    for image_name in docker_config_dict.keys():
+      build_script_path = docker_config_dict[image_name]['buildScript']
+      dockerfile_dir = docker_config_dict[image_name]['dockerFileDir']
+      build_type = docker_config_dict[image_name]['buildType']
+      docker_images_dict[image_name] = DockerImage(gcp_project_id, image_name,
+                                                   build_script_path,
+                                                   dockerfile_dir, build_type)
+    return docker_images_dict
+
+  def _parse_client_templates(self, config_dict):
+    """Parses the 'clientTemplates' section of the config file and returns a
+    Dictionary of 'ClientTemplate' objects keyed by client template names.
+
+    Note: The 'baseTemplates' sub section of the config file contains templates
+    with default values  and the 'templates' sub section contains the actual
+    client templates (which refer to the base template name to use for default
+    values).
+    """
+    client_templates_dict = {}
+
+    templates_dict = config_dict['clientTemplates']['templates']
+    base_templates_dict = config_dict['clientTemplates'].get('baseTemplates',
+                                                             {})
+    for template_name in templates_dict.keys():
+      # temp_dict is a temporary dictionary that merges base template dictionary
+      # and client template dictionary (with client template dictionary values
+      # overriding base template values)
+      temp_dict = {}
+
+      base_template_name = templates_dict[template_name].get('baseTemplate')
+      if not base_template_name is None:
+        temp_dict = base_templates_dict[base_template_name].copy()
+
+      temp_dict.update(templates_dict[template_name])
+
+      # Create and add ClientTemplate object to the final client_templates_dict
+      client_templates_dict[template_name] = ClientTemplate(
+          template_name, temp_dict['clientImagePath'],
+          temp_dict['metricsClientImagePath'], temp_dict['metricsPort'],
+          temp_dict['wrapperScriptPath'], temp_dict['pollIntervalSecs'],
+          temp_dict['clientArgs'].copy(), temp_dict['metricsArgs'].copy())
+
+    return client_templates_dict
+
+  def _parse_server_templates(self, config_dict):
+    """Parses the 'serverTemplates' section of the config file and returns a
+    Dictionary of 'serverTemplate' objects keyed by server template names.
+
+    Note: The 'baseTemplates' sub section of the config file contains templates
+    with default values  and the 'templates' sub section contains the actual
+    server templates (which refer to the base template name to use for default
+    values).
+    """
+    server_templates_dict = {}
+
+    templates_dict = config_dict['serverTemplates']['templates']
+    base_templates_dict = config_dict['serverTemplates'].get('baseTemplates',
+                                                             {})
+
+    for template_name in templates_dict.keys():
+      # temp_dict is a temporary dictionary that merges base template dictionary
+      # and server template dictionary (with server template dictionary values
+      # overriding base template values)
+      temp_dict = {}
+
+      base_template_name = templates_dict[template_name].get('baseTemplate')
+      if not base_template_name is None:
+        temp_dict = base_templates_dict[base_template_name].copy()
+
+      temp_dict.update(templates_dict[template_name])
+
+      # Create and add ServerTemplate object to the final server_templates_dict
+      server_templates_dict[template_name] = ServerTemplate(
+          template_name, temp_dict['serverImagePath'],
+          temp_dict['wrapperScriptPath'], temp_dict['serverPort'],
+          temp_dict['serverArgs'].copy())
+
+    return server_templates_dict
+
+  def _parse_server_pod_specs(self, config_dict, docker_images_dict,
+                              server_templates_dict):
+    """Parses the 'serverPodSpecs' sub-section (under 'testMatrix' section) of
+    the config file and returns a Dictionary of 'ServerPodSpec' objects keyed
+    by server pod spec names"""
+    server_pod_specs_dict = {}
+
+    pod_specs_dict = config_dict['testMatrix'].get('serverPodSpecs', {})
+
+    for pod_name in pod_specs_dict.keys():
+      server_template_name = pod_specs_dict[pod_name]['serverTemplate']
+      docker_image_name = pod_specs_dict[pod_name]['dockerImage']
+      num_instances = pod_specs_dict[pod_name].get('numInstances', 1)
+
+      # Create and add the ServerPodSpec object to the final
+      # server_pod_specs_dict
+      server_pod_specs_dict[pod_name] = ServerPodSpec(
+          pod_name, server_templates_dict[server_template_name],
+          docker_images_dict[docker_image_name], num_instances)
+
+    return server_pod_specs_dict
+
+  def _parse_client_pod_specs(self, config_dict, docker_images_dict,
+                              client_templates_dict, server_pod_specs_dict):
+    """Parses the 'clientPodSpecs' sub-section (under 'testMatrix' section) of
+    the config file and returns a Dictionary of 'ClientPodSpec' objects keyed
+    by client pod spec names"""
+    client_pod_specs_dict = {}
+
+    pod_specs_dict = config_dict['testMatrix'].get('clientPodSpecs', {})
+    for pod_name in pod_specs_dict.keys():
+      client_template_name = pod_specs_dict[pod_name]['clientTemplate']
+      docker_image_name = pod_specs_dict[pod_name]['dockerImage']
+      num_instances = pod_specs_dict[pod_name]['numInstances']
+
+      # Get the server addresses from the server pod spec object
+      server_pod_spec_name = pod_specs_dict[pod_name]['serverPodSpec']
+      server_addresses = server_pod_specs_dict[
+          server_pod_spec_name].server_addresses()
+
+      client_pod_specs_dict[pod_name] = ClientPodSpec(
+          pod_name, client_templates_dict[client_template_name],
+          docker_images_dict[docker_image_name], num_instances,
+          server_addresses)
+
+    return client_pod_specs_dict
+
+  def _load_config(self, config_filename):
+    """Opens the config file and converts the Json text to Dictionary"""
+    if not os.path.isabs(config_filename):
+      raise Exception('Config objects expects an absolute file path. '
+                      'config file name passed: %s' % config_filename)
+    with open(config_filename) as config_file:
+      return json.load(config_file)
+
+
+def run_tests(config):
+  """ The main function that launches the stress tests """
+  # Build docker images and push to GKE registry
+  if config.global_settings.build_docker_images:
+    for name, docker_image in config.docker_images_dict.iteritems():
+      if not (docker_image.build_image() and
+              docker_image.push_to_gke_registry()):
+        return False
+
+  # Create a unique id for this run (Note: Using timestamp instead of UUID to
+  # make it easier to deduce the date/time of the run just by looking at the run
+  # run id. This is useful in debugging when looking at records in Biq query)
+  run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
+  dataset_id = '%s_%s' % (config.global_settings.dataset_id_prefix, run_id)
+
+  bq_helper = BigQueryHelper(run_id, '', '',
+                             config.global_settings.gcp_project_id, dataset_id,
+                             config.global_settings.summary_table_id,
+                             config.global_settings.qps_table_id)
+  bq_helper.initialize()
+
+  gke = Gke(config.global_settings.gcp_project_id, run_id, dataset_id,
+            config.global_settings.summary_table_id,
+            config.global_settings.qps_table_id,
+            config.global_settings.kubernetes_proxy_port)
+
+  is_success = True
+
+  try:
+    print 'Launching servers..'
+    for name, server_pod_spec in config.server_pod_specs_dict.iteritems():
+      if not gke.launch_servers(server_pod_spec):
+        is_success = False  # is_success is checked in the 'finally' block
+        return False
+
+    print('Launched servers. Waiting for %d seconds for the server pods to be '
+          'fully online') % config.global_settings.pod_warmup_secs
+    time.sleep(config.global_settings.pod_warmup_secs)
+
+    for name, client_pod_spec in config.client_pod_specs_dict.iteritems():
+      if not gke.launch_clients(client_pod_spec):
+        is_success = False  # is_success is checked in the 'finally' block
+        return False
+
+    print('Launched all clients. Waiting for %d seconds for the client pods to '
+          'be fully online') % config.global_settings.pod_warmup_secs
+    time.sleep(config.global_settings.pod_warmup_secs)
+
+    start_time = datetime.datetime.now()
+    end_time = start_time + datetime.timedelta(
+        seconds=config.global_settings.test_duration_secs)
+    print 'Running the test until %s' % end_time.isoformat()
+
+    while True:
+      if datetime.datetime.now() > end_time:
+        print 'Test was run for %d seconds' % config.global_settings.test_duration_secs
+        break
+
+      # Check if either stress server or clients have failed (btw, the bq_helper
+      # monitors all the rows in the summary table and checks if any of them
+      # have a failure status)
+      if bq_helper.check_if_any_tests_failed():
+        is_success = False
+        print 'Some tests failed.'
+        break  # Don't 'return' here. We still want to call bq_helper to print qps/summary tables
+
+      # Tests running fine. Wait until next poll time to check the status
+      print 'Sleeping for %d seconds..' % config.global_settings.test_poll_interval_secs
+      time.sleep(config.global_settings.test_poll_interval_secs)
+
+    # Print BiqQuery tables
+    bq_helper.print_qps_records()
+    bq_helper.print_summary_records()
+
+  finally:
+    # If there was a test failure, we should not delete the pods since they
+    # would contain useful debug information (logs, core dumps etc)
+    if is_success:
+      for name, server_pod_spec in config.server_pod_specs_dict.iteritems():
+        gke.delete_servers(server_pod_spec)
+      for name, client_pod_spec in config.client_pod_specs_dict.iteritems():
+        gke.delete_clients(client_pod_spec)
+
+  return is_success
+
+
+argp = argparse.ArgumentParser(
+    description='Launch stress tests in GKE',
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+argp.add_argument('--gcp_project_id',
+                  required=True,
+                  help='The Google Cloud Platform Project Id')
+argp.add_argument('--config_file',
+                  required=True,
+                  type=str,
+                  help='The test config file')
+
+if __name__ == '__main__':
+  args = argp.parse_args()
+
+  config_filename = args.config_file
+
+  # Since we will be changing the current working directory to grpc root in the
+  # next step, we should check if the config filename path is a relative path
+  # (i.e a path relative to the current working directory) and if so, convert it
+  # to abosulte path
+  if not os.path.isabs(config_filename):
+    config_filename = os.path.abspath(config_filename)
+
+  config = Config(config_filename, args.gcp_project_id)
+
+  # Change current working directory to grpc root
+  # (This is important because all relative file paths in the config file are
+  # supposed to interpreted as relative to the GRPC root)
+  grpc_root = os.path.abspath(os.path.join(
+      os.path.dirname(sys.argv[0]), '../../..'))
+  os.chdir(grpc_root)
+
+  run_tests(config)
diff --git a/tools/run_tests/stress_test/run_stress_tests_on_gke.py b/tools/run_tests/stress_test/run_stress_tests_on_gke.py
deleted file mode 100755
index 634eb1aca5..0000000000
--- a/tools/run_tests/stress_test/run_stress_tests_on_gke.py
+++ /dev/null
@@ -1,556 +0,0 @@
-#!/usr/bin/env python2.7
-# Copyright 2015-2016, Google Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-#     * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following disclaimer
-# in the documentation and/or other materials provided with the
-# distribution.
-#     * Neither the name of Google Inc. nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import argparse
-import datetime
-import os
-import subprocess
-import sys
-import time
-
-stress_test_utils_dir = os.path.abspath(os.path.join(
-    os.path.dirname(__file__), '../../gcp/stress_test'))
-sys.path.append(stress_test_utils_dir)
-from stress_test_utils import BigQueryHelper
-
-kubernetes_api_dir = os.path.abspath(os.path.join(
-    os.path.dirname(__file__), '../../gcp/utils'))
-sys.path.append(kubernetes_api_dir)
-
-import kubernetes_api
-
-_GRPC_ROOT = os.path.abspath(os.path.join(
-    os.path.dirname(sys.argv[0]), '../../..'))
-os.chdir(_GRPC_ROOT)
-
-# num of seconds to wait for the GKE image to start and warmup
-_GKE_IMAGE_WARMUP_WAIT_SECS = 60
-
-_SERVER_POD_NAME = 'stress-server'
-_CLIENT_POD_NAME_PREFIX = 'stress-client'
-_DATASET_ID_PREFIX = 'stress_test'
-_SUMMARY_TABLE_ID = 'summary'
-_QPS_TABLE_ID = 'qps'
-
-_DEFAULT_DOCKER_IMAGE_NAME = 'grpc_stress_test'
-
-# The default port on which the kubernetes proxy server is started on localhost
-# (i.e kubectl proxy --port=<port>)
-_DEFAULT_KUBERNETES_PROXY_PORT = 8001
-
-# How frequently should the stress client wrapper script (running inside a GKE
-# container) poll the health of the stress client (also running inside the GKE
-# container) and upload metrics to BigQuery
-_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS = 60
-
-# The default setting for stress test server and client
-_DEFAULT_STRESS_SERVER_PORT = 8080
-_DEFAULT_METRICS_PORT = 8081
-_DEFAULT_TEST_CASES_STR = 'empty_unary:1,large_unary:1,client_streaming:1,server_streaming:1,empty_stream:1'
-_DEFAULT_NUM_CHANNELS_PER_SERVER = 5
-_DEFAULT_NUM_STUBS_PER_CHANNEL = 10
-_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS = 30
-
-# Number of stress client instances to launch
-_DEFAULT_NUM_CLIENTS = 3
-
-# How frequently should this test monitor the health of Stress clients and
-# Servers running in GKE
-_DEFAULT_TEST_POLL_INTERVAL_SECS = 60
-
-# Default run time for this test (2 hour)
-_DEFAULT_TEST_DURATION_SECS = 7200
-
-# The number of seconds it would take a GKE pod to warm up (i.e get to 'Running'
-# state from the time of creation). Ideally this is something the test should
-# automatically determine by using Kubernetes API to poll the pods status.
-_DEFAULT_GKE_WARMUP_SECS = 60
-
-
-class KubernetesProxy:
-  """ Class to start a proxy on localhost to the Kubernetes API server """
-
-  def __init__(self, api_port):
-    self.port = api_port
-    self.p = None
-    self.started = False
-
-  def start(self):
-    cmd = ['kubectl', 'proxy', '--port=%d' % self.port]
-    self.p = subprocess.Popen(args=cmd)
-    self.started = True
-    time.sleep(2)
-    print '..Started'
-
-  def get_port(self):
-    return self.port
-
-  def is_started(self):
-    return self.started
-
-  def __del__(self):
-    if self.p is not None:
-      print 'Shutting down Kubernetes proxy..'
-      self.p.kill()
-
-
-class TestSettings:
-
-  def __init__(self, build_docker_image, test_poll_interval_secs,
-               test_duration_secs, kubernetes_proxy_port):
-    self.build_docker_image = build_docker_image
-    self.test_poll_interval_secs = test_poll_interval_secs
-    self.test_duration_secs = test_duration_secs
-    self.kubernetes_proxy_port = kubernetes_proxy_port
-
-
-class GkeSettings:
-
-  def __init__(self, project_id, docker_image_name):
-    self.project_id = project_id
-    self.docker_image_name = docker_image_name
-    self.tag_name = 'gcr.io/%s/%s' % (project_id, docker_image_name)
-
-
-class BigQuerySettings:
-
-  def __init__(self, run_id, dataset_id, summary_table_id, qps_table_id):
-    self.run_id = run_id
-    self.dataset_id = dataset_id
-    self.summary_table_id = summary_table_id
-    self.qps_table_id = qps_table_id
-
-
-class StressServerSettings:
-
-  def __init__(self, server_pod_name, server_port):
-    self.server_pod_name = server_pod_name
-    self.server_port = server_port
-
-
-class StressClientSettings:
-
-  def __init__(self, num_clients, client_pod_name_prefix, server_pod_name,
-               server_port, metrics_port, metrics_collection_interval_secs,
-               stress_client_poll_interval_secs, num_channels_per_server,
-               num_stubs_per_channel, test_cases_str):
-    self.num_clients = num_clients
-    self.client_pod_name_prefix = client_pod_name_prefix
-    self.server_pod_name = server_pod_name
-    self.server_port = server_port
-    self.metrics_port = metrics_port
-    self.metrics_collection_interval_secs = metrics_collection_interval_secs
-    self.stress_client_poll_interval_secs = stress_client_poll_interval_secs
-    self.num_channels_per_server = num_channels_per_server
-    self.num_stubs_per_channel = num_stubs_per_channel
-    self.test_cases_str = test_cases_str
-
-    # == Derived properties ==
-    # Note: Client can accept a list of server addresses (a comma separated list
-    # of 'server_name:server_port'). In this case, we only have one server
-    # address to pass
-    self.server_addresses = '%s.default.svc.cluster.local:%d' % (
-        server_pod_name, server_port)
-    self.client_pod_names_list = ['%s-%d' % (client_pod_name_prefix, i)
-                                  for i in range(1, num_clients + 1)]
-
-
-def _build_docker_image(image_name, tag_name):
-  """ Build the docker image and add tag it to the GKE repository """
-  print 'Building docker image: %s' % image_name
-  os.environ['INTEROP_IMAGE'] = image_name
-  os.environ['INTEROP_IMAGE_REPOSITORY_TAG'] = tag_name
-  # Note that 'BASE_NAME' HAS to be 'grpc_interop_stress_cxx' since the script
-  # build_interop_stress_image.sh invokes the following script:
-  #   tools/dockerfile/$BASE_NAME/build_interop_stress.sh
-  os.environ['BASE_NAME'] = 'grpc_interop_stress_cxx'
-  cmd = ['tools/jenkins/build_interop_stress_image.sh']
-  retcode = subprocess.call(args=cmd)
-  if retcode != 0:
-    print 'Error in building docker image'
-    return False
-  return True
-
-
-def _push_docker_image_to_gke_registry(docker_tag_name):
-  """Executes 'gcloud docker push <docker_tag_name>' to push the image to GKE registry"""
-  cmd = ['gcloud', 'docker', 'push', docker_tag_name]
-  print 'Pushing %s to GKE registry..' % docker_tag_name
-  retcode = subprocess.call(args=cmd)
-  if retcode != 0:
-    print 'Error in pushing docker image %s to the GKE registry' % docker_tag_name
-    return False
-  return True
-
-
-def _launch_server(gke_settings, stress_server_settings, bq_settings,
-                   kubernetes_proxy):
-  """ Launches a stress test server instance in GKE cluster """
-  if not kubernetes_proxy.is_started:
-    print 'Kubernetes proxy must be started before calling this function'
-    return False
-
-  # This is the wrapper script that is run in the container. This script runs
-  # the actual stress test server
-  server_cmd_list = ['/var/local/git/grpc/tools/gcp/stress_test/run_server.py']
-
-  # run_server.py does not take any args from the command line. The args are
-  # instead passed via environment variables (see server_env below)
-  server_arg_list = []
-
-  # The parameters to the script run_server.py are injected into the container
-  # via environment variables
-  server_env = {
-      'STRESS_TEST_IMAGE_TYPE': 'SERVER',
-      'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/interop_server',
-      'STRESS_TEST_ARGS_STR': '--port=%s' % stress_server_settings.server_port,
-      'RUN_ID': bq_settings.run_id,
-      'POD_NAME': stress_server_settings.server_pod_name,
-      'GCP_PROJECT_ID': gke_settings.project_id,
-      'DATASET_ID': bq_settings.dataset_id,
-      'SUMMARY_TABLE_ID': bq_settings.summary_table_id,
-      'QPS_TABLE_ID': bq_settings.qps_table_id
-  }
-
-  # Launch Server
-  is_success = kubernetes_api.create_pod_and_service(
-      'localhost',
-      kubernetes_proxy.get_port(),
-      'default',  # Use 'default' namespace
-      stress_server_settings.server_pod_name,
-      gke_settings.tag_name,
-      [stress_server_settings.server_port],  # Port that should be exposed
-      server_cmd_list,
-      server_arg_list,
-      server_env,
-      True  # Headless = True for server. Since we want DNS records to be created by GKE
-  )
-
-  return is_success
-
-
-def _launch_client(gke_settings, stress_server_settings, stress_client_settings,
-                   bq_settings, kubernetes_proxy):
-  """ Launches a configurable number of stress test clients on GKE cluster """
-  if not kubernetes_proxy.is_started:
-    print 'Kubernetes proxy must be started before calling this function'
-    return False
-
-  stress_client_arg_list = [
-      '--server_addresses=%s' % stress_client_settings.server_addresses,
-      '--test_cases=%s' % stress_client_settings.test_cases_str,
-      '--num_stubs_per_channel=%d' %
-      stress_client_settings.num_stubs_per_channel
-  ]
-
-  # This is the wrapper script that is run in the container. This script runs
-  # the actual stress client
-  client_cmd_list = ['/var/local/git/grpc/tools/gcp/stress_test/run_client.py']
-
-  # run_client.py takes no args. All args are passed as env variables (see
-  # client_env)
-  client_arg_list = []
-
-  metrics_server_address = 'localhost:%d' % stress_client_settings.metrics_port
-  metrics_client_arg_list = [
-      '--metrics_server_address=%s' % metrics_server_address,
-      '--total_only=true'
-  ]
-
-  # The parameters to the script run_client.py are injected into the container
-  # via environment variables
-  client_env = {
-      'STRESS_TEST_IMAGE_TYPE': 'CLIENT',
-      'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/stress_test',
-      'STRESS_TEST_ARGS_STR': ' '.join(stress_client_arg_list),
-      'METRICS_CLIENT_IMAGE': '/var/local/git/grpc/bins/opt/metrics_client',
-      'METRICS_CLIENT_ARGS_STR': ' '.join(metrics_client_arg_list),
-      'RUN_ID': bq_settings.run_id,
-      'POLL_INTERVAL_SECS':
-          str(stress_client_settings.stress_client_poll_interval_secs),
-      'GCP_PROJECT_ID': gke_settings.project_id,
-      'DATASET_ID': bq_settings.dataset_id,
-      'SUMMARY_TABLE_ID': bq_settings.summary_table_id,
-      'QPS_TABLE_ID': bq_settings.qps_table_id
-  }
-
-  for pod_name in stress_client_settings.client_pod_names_list:
-    client_env['POD_NAME'] = pod_name
-    is_success = kubernetes_api.create_pod_and_service(
-        'localhost',  # Since proxy is running on localhost
-        kubernetes_proxy.get_port(),
-        'default',  # default namespace
-        pod_name,
-        gke_settings.tag_name,
-        [stress_client_settings.metrics_port
-        ],  # Client pods expose metrics port
-        client_cmd_list,
-        client_arg_list,
-        client_env,
-        False  # Client is not a headless service
-    )
-    if not is_success:
-      print 'Error in launching client %s' % pod_name
-      return False
-
-  return True
-
-
-def _launch_server_and_client(gke_settings, stress_server_settings,
-                              stress_client_settings, bq_settings,
-                              kubernetes_proxy_port):
-  # Start kubernetes proxy
-  print 'Kubernetes proxy'
-  kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port)
-  kubernetes_proxy.start()
-
-  print 'Launching server..'
-  is_success = _launch_server(gke_settings, stress_server_settings, bq_settings,
-                              kubernetes_proxy)
-  if not is_success:
-    print 'Error in launching server'
-    return False
-
-  # Server takes a while to start.
-  # TODO(sree) Use Kubernetes API to query the status of the server instead of
-  # sleeping
-  print 'Waiting for %s seconds for the server to start...' % _GKE_IMAGE_WARMUP_WAIT_SECS
-  time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS)
-
-  # Launch client
-  client_pod_name_prefix = 'stress-client'
-  is_success = _launch_client(gke_settings, stress_server_settings,
-                              stress_client_settings, bq_settings,
-                              kubernetes_proxy)
-
-  if not is_success:
-    print 'Error in launching client(s)'
-    return False
-
-  print 'Waiting for %s seconds for the client images to start...' % _GKE_IMAGE_WARMUP_WAIT_SECS
-  time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS)
-  return True
-
-
-def _delete_server_and_client(stress_server_settings, stress_client_settings,
-                              kubernetes_proxy_port):
-  kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port)
-  kubernetes_proxy.start()
-
-  # Delete clients first
-  is_success = True
-  for pod_name in stress_client_settings.client_pod_names_list:
-    is_success = kubernetes_api.delete_pod_and_service(
-        'localhost', kubernetes_proxy_port, 'default', pod_name)
-    if not is_success:
-      return False
-
-  # Delete server
-  is_success = kubernetes_api.delete_pod_and_service(
-      'localhost', kubernetes_proxy_port, 'default',
-      stress_server_settings.server_pod_name)
-  return is_success
-
-
-def run_test_main(test_settings, gke_settings, stress_server_settings,
-                  stress_client_clients):
-  is_success = True
-
-  if test_settings.build_docker_image:
-    is_success = _build_docker_image(gke_settings.docker_image_name,
-                                     gke_settings.tag_name)
-    if not is_success:
-      return False
-
-    is_success = _push_docker_image_to_gke_registry(gke_settings.tag_name)
-    if not is_success:
-      return False
-
-  # Create a unique id for this run (Note: Using timestamp instead of UUID to
-  # make it easier to deduce the date/time of the run just by looking at the run
-  # run id. This is useful in debugging when looking at records in Biq query)
-  run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
-  dataset_id = '%s_%s' % (_DATASET_ID_PREFIX, run_id)
-
-  # Big Query settings (common for both Stress Server and Client)
-  bq_settings = BigQuerySettings(run_id, dataset_id, _SUMMARY_TABLE_ID,
-                                 _QPS_TABLE_ID)
-
-  bq_helper = BigQueryHelper(run_id, '', '', args.project_id, dataset_id,
-                             _SUMMARY_TABLE_ID, _QPS_TABLE_ID)
-  bq_helper.initialize()
-
-  try:
-    is_success = _launch_server_and_client(gke_settings, stress_server_settings,
-                                           stress_client_settings, bq_settings,
-                                           test_settings.kubernetes_proxy_port)
-    if not is_success:
-      return False
-
-    start_time = datetime.datetime.now()
-    end_time = start_time + datetime.timedelta(
-        seconds=test_settings.test_duration_secs)
-    print 'Running the test until %s' % end_time.isoformat()
-
-    while True:
-      if datetime.datetime.now() > end_time:
-        print 'Test was run for %d seconds' % test_settings.test_duration_secs
-        break
-
-      # Check if either stress server or clients have failed
-      if bq_helper.check_if_any_tests_failed():
-        is_success = False
-        print 'Some tests failed.'
-        break
-
-      # Things seem to be running fine. Wait until next poll time to check the
-      # status
-      print 'Sleeping for %d seconds..' % test_settings.test_poll_interval_secs
-      time.sleep(test_settings.test_poll_interval_secs)
-
-    # Print BiqQuery tables
-    bq_helper.print_summary_records()
-    bq_helper.print_qps_records()
-
-  finally:
-    # If is_success is False at this point, it means that the stress tests were
-    # started successfully but failed while running the tests. In this case we
-    # do should not delete the pods (since they contain all the failure
-    # information)
-    if is_success:
-      _delete_server_and_client(stress_server_settings, stress_client_settings,
-                                test_settings.kubernetes_proxy_port)
-
-  return is_success
-
-
-argp = argparse.ArgumentParser(
-    description='Launch stress tests in GKE',
-    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-argp.add_argument('--project_id',
-                  required=True,
-                  help='The Google Cloud Platform Project Id')
-argp.add_argument('--num_clients',
-                  default=1,
-                  type=int,
-                  help='Number of client instances to start')
-argp.add_argument('--docker_image_name',
-                  default=_DEFAULT_DOCKER_IMAGE_NAME,
-                  help='The name of the docker image containing stress client '
-                  'and stress servers')
-argp.add_argument('--build_docker_image',
-                  dest='build_docker_image',
-                  action='store_true',
-                  help='Build a docker image and push to Google Container '
-                  'Registry')
-argp.add_argument('--do_not_build_docker_image',
-                  dest='build_docker_image',
-                  action='store_false',
-                  help='Do not build and push docker image to Google Container '
-                  'Registry')
-argp.set_defaults(build_docker_image=True)
-
-argp.add_argument('--test_poll_interval_secs',
-                  default=_DEFAULT_TEST_POLL_INTERVAL_SECS,
-                  type=int,
-                  help='How frequently should this script should monitor the '
-                  'health of stress clients and servers running in the GKE '
-                  'cluster')
-argp.add_argument('--test_duration_secs',
-                  default=_DEFAULT_TEST_DURATION_SECS,
-                  type=int,
-                  help='How long should this test be run')
-argp.add_argument('--kubernetes_proxy_port',
-                  default=_DEFAULT_KUBERNETES_PROXY_PORT,
-                  type=int,
-                  help='The port on which the kubernetes proxy (on localhost)'
-                  ' is started')
-argp.add_argument('--stress_server_port',
-                  default=_DEFAULT_STRESS_SERVER_PORT,
-                  type=int,
-                  help='The port on which the stress server (in GKE '
-                  'containers) listens')
-argp.add_argument('--stress_client_metrics_port',
-                  default=_DEFAULT_METRICS_PORT,
-                  type=int,
-                  help='The port on which the stress clients (in GKE '
-                  'containers) expose metrics')
-argp.add_argument('--stress_client_poll_interval_secs',
-                  default=_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS,
-                  type=int,
-                  help='How frequently should the stress client wrapper script'
-                  ' running inside GKE should monitor health of the actual '
-                  ' stress client process and upload the metrics to BigQuery')
-argp.add_argument('--stress_client_metrics_collection_interval_secs',
-                  default=_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS,
-                  type=int,
-                  help='How frequently should metrics be collected in-memory on'
-                  ' the stress clients (running inside GKE containers). Note '
-                  'that this is NOT the same as the upload-to-BigQuery '
-                  'frequency. The metrics upload frequency is controlled by the'
-                  ' --stress_client_poll_interval_secs flag')
-argp.add_argument('--stress_client_num_channels_per_server',
-                  default=_DEFAULT_NUM_CHANNELS_PER_SERVER,
-                  type=int,
-                  help='The number of channels created to each server from a '
-                  'stress client')
-argp.add_argument('--stress_client_num_stubs_per_channel',
-                  default=_DEFAULT_NUM_STUBS_PER_CHANNEL,
-                  type=int,
-                  help='The number of stubs created per channel. This number '
-                  'indicates the max number of RPCs that can be made in '
-                  'parallel on each channel at any given time')
-argp.add_argument('--stress_client_test_cases',
-                  default=_DEFAULT_TEST_CASES_STR,
-                  help='List of test cases (with weights) to be executed by the'
-                  ' stress test client. The list is in the following format:\n'
-                  '  <testcase_1:w_1,<test_case2:w_2>..<testcase_n:w_n>\n'
-                  ' (Note: The weights do not have to add up to 100)')
-
-if __name__ == '__main__':
-  args = argp.parse_args()
-
-  test_settings = TestSettings(
-      args.build_docker_image, args.test_poll_interval_secs,
-      args.test_duration_secs, args.kubernetes_proxy_port)
-
-  gke_settings = GkeSettings(args.project_id, args.docker_image_name)
-
-  stress_server_settings = StressServerSettings(_SERVER_POD_NAME,
-                                                args.stress_server_port)
-  stress_client_settings = StressClientSettings(
-      args.num_clients, _CLIENT_POD_NAME_PREFIX, _SERVER_POD_NAME,
-      args.stress_server_port, args.stress_client_metrics_port,
-      args.stress_client_metrics_collection_interval_secs,
-      args.stress_client_poll_interval_secs,
-      args.stress_client_num_channels_per_server,
-      args.stress_client_num_stubs_per_channel, args.stress_client_test_cases)
-
-  run_test_main(test_settings, gke_settings, stress_server_settings,
-                stress_client_settings)
author	Jan Tattermusch <jtattermusch@users.noreply.github.com>	2016-03-29 09:05:43 -0700
committer	Jan Tattermusch <jtattermusch@users.noreply.github.com>	2016-03-29 09:05:43 -0700
commit	46962bb5f88fa84cecb6ef1246f3307a4a13f04f (patch)
tree	1342a5cd2d7c3b524fb55dfda685d3f85fc277dc /tools
parent	d2eb23974d62e7d6f2ed6486ba559e36f16b8081 (diff)
parent	e68ec434804949209329c5953b65ae31e920a620 (diff)