aboutsummaryrefslogtreecommitdiffhomepage
path: root/tools/run_tests/stress_test
diff options
context:
space:
mode:
author Sree Kuchibhotla <sreek@google.com> 2016-02-19 03:02:16 -0800
committer Sree Kuchibhotla <sreek@google.com> 2016-02-24 11:15:53 -0800
commit 559e45becd0a50bd6af850900abbb2b5759f8719 (patch)
tree 3b8b052fd7d0b6bcff102601ef857c43d0b17ad5 /tools/run_tests/stress_test
parent 44ca2c26409a172b80bc9f40f7578f3eaf1d135d (diff)
Scripts to launch stress tests in GKE
Diffstat (limited to 'tools/run_tests/stress_test')
-rwxr-xr-x tools/run_tests/stress_test/run_client.py 188
-rwxr-xr-x tools/run_tests/stress_test/run_server.py 115
-rwxr-xr-x tools/run_tests/stress_test/stress_test_utils.py 192
3 files changed, 495 insertions, 0 deletions
diff --git a/tools/run_tests/stress_test/run_client.py b/tools/run_tests/stress_test/run_client.py
new file mode 100755
index 0000000000..33958bce49
--- /dev/null
+++ b/tools/run_tests/stress_test/run_client.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python2.7
+# Copyright 2015-2016, Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import datetime
+import os
+import re
+import select
+import subprocess
+import sys
+import time
+
+from stress_test_utils import EventType
+from stress_test_utils import BigQueryHelper
+
+
+# TODO (sree): Write a python grpc client to directly query the metrics instead
+# of calling metrics_client
+def _get_qps(metrics_cmd):
+ qps = 0
+ try:
+ # Note: gpr_log() writes even non-error messages to stderr stream. So it is
+ # important that we set stderr=subprocess.STDOUT
+ p = subprocess.Popen(args=metrics_cmd,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT)
+ retcode = p.wait()
+ (out_str, err_str) = p.communicate()
+ if retcode != 0:
+ print 'Error in reading metrics information'
+ print 'Output: ', out_str
+ else:
+ # The overall qps is printed at the end of the line
+ m = re.search('\d+$', out_str)
+ qps = int(m.group()) if m else 0
+ except Exception as ex:
+ print 'Exception while reading metrics information: ' + str(ex)
+ return qps
+
+
+def run_client():
+ """This is a wrapper around the stress test client and performs the following:
+ 1) Create the following two tables in Big Query:
+ (i) Summary table: To record events like the test started, completed
+ successfully or failed
+ (ii) Qps table: To periodically record the QPS sent by this client
+ 2) Start the stress test client and add a row in the Big Query summary
+ table
+ 3) Once every few seconds (as specified by the poll_interval_secs) poll
+ the status of the stress test client process and perform the
+ following:
+ 3.1) If the process is still running, get the current qps by invoking
+ the metrics client program and add a row in the Big Query
+ Qps table. Sleep for a duration specified by poll_interval_secs
+ 3.2) If the process exited successfully, add a row in the Big Query
+ Summary table and exit
+ 3.3) If the process failed, add a row in Big Query summary table and
+ wait forever.
+ NOTE: This script typically runs inside a GKE pod which means
+ that the pod gets destroyed when the script exits. However, in
+ case the stress test client fails, we would not want the pod to
+ be destroyed (since we might want to connect to the pod for
+ examining logs). This is the reason why the script waits forever
+ in case of failures
+ """
+ env = dict(os.environ)
+ image_type = env['STRESS_TEST_IMAGE_TYPE']
+ image_name = env['STRESS_TEST_IMAGE']
+ args_str = env['STRESS_TEST_ARGS_STR']
+ metrics_client_image = env['METRICS_CLIENT_IMAGE']
+ metrics_client_args_str = env['METRICS_CLIENT_ARGS_STR']
+ run_id = env['RUN_ID']
+ pod_name = env['POD_NAME']
+ logfile_name = env.get('LOGFILE_NAME')  # Optional; every other variable here is required
+ poll_interval_secs = float(env['POLL_INTERVAL_SECS'])
+ project_id = env['GCP_PROJECT_ID']
+ dataset_id = env['DATASET_ID']
+ summary_table_id = env['SUMMARY_TABLE_ID']
+ qps_table_id = env['QPS_TABLE_ID']
+
+ bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
+ dataset_id, summary_table_id, qps_table_id)
+ bq_helper.initialize()
+
+ # Create BigQuery Dataset and Tables: Summary Table and Qps Table
+ if not bq_helper.setup_tables():
+ print 'Error in creating BigQuery tables'
+ return
+
+ start_time = datetime.datetime.now()  # NOTE(review): never read again in this function
+
+ logfile = None
+ details = 'Logging to stdout'
+ if logfile_name is not None:
+ print 'Opening logfile: %s ...' % logfile_name
+ details = 'Logfile: %s' % logfile_name
+ logfile = open(logfile_name, 'w')  # NOTE(review): never closed explicitly
+
+ # Update status that the test is starting (in the summary table)
+ bq_helper.insert_summary_row(EventType.STARTING, details)
+
+ metrics_cmd = [metrics_client_image
+ ] + [x for x in metrics_client_args_str.split()]
+ stress_cmd = [image_name] + [x for x in args_str.split()]
+
+ print 'Launching process %s ...' % stress_cmd
+ stress_p = subprocess.Popen(args=stress_cmd,
+ stdout=logfile,
+ stderr=subprocess.STDOUT)
+
+ qps_history = [1, 1, 1] # Last 3 qps readings; seeded non-zero so the zero check needs 3 real samples
+ qps_history_idx = 0 # Index into the qps_history list
+
+ is_error = False
+ while True:
+ # Check if stress_client has exited. If it has, record the outcome in the
+ # BigQuery summary table and stop polling
+ if stress_p.poll() is not None:
+ # TODO(sree) Upload completion status to BigQuery
+ end_time = datetime.datetime.now().isoformat()
+ event_type = EventType.SUCCESS
+ details = 'End time: %s' % end_time
+ if stress_p.returncode != 0:
+ event_type = EventType.FAILURE
+ details = 'Return code = %d. End time: %s' % (stress_p.returncode,
+ end_time)
+ is_error = True
+ bq_helper.insert_summary_row(event_type, details)
+ print details
+ break
+
+ # Stress client still running. Get metrics
+ qps = _get_qps(metrics_cmd)
+ qps_recorded_at = datetime.datetime.now().isoformat()
+ print 'qps: %d at %s' % (qps, qps_recorded_at)
+
+ # If QPS has been zero for the last 3 iterations, flag it as error and exit
+ qps_history[qps_history_idx] = qps
+ qps_history_idx = (qps_history_idx + 1) % len(qps_history)
+ if sum(qps_history) == 0:
+ details = 'QPS has been zero for the last %d seconds - as of : %s' % (
+ poll_interval_secs * 3, qps_recorded_at)  # 3 mirrors len(qps_history)
+ is_error = True
+ bq_helper.insert_summary_row(EventType.FAILURE, details)
+ print details
+ break
+
+ # Upload qps metrics to BigQuery
+ bq_helper.insert_qps_row(qps, qps_recorded_at)
+
+ time.sleep(poll_interval_secs)
+
+ if is_error:
+ print 'Waiting indefinitely..'
+ select.select([], [], [])  # No descriptors and no timeout: blocks so the script does not exit
+
+ print 'Completed'
+ return
+
+
+if __name__ == '__main__':  # Run the wrapper only when executed as a script
+ run_client()
diff --git a/tools/run_tests/stress_test/run_server.py b/tools/run_tests/stress_test/run_server.py
new file mode 100755
index 0000000000..9ad8d63638
--- /dev/null
+++ b/tools/run_tests/stress_test/run_server.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python2.7
+# Copyright 2015-2016, Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import datetime
+import os
+import select
+import subprocess
+import sys
+import time
+
+from stress_test_utils import BigQueryHelper
+from stress_test_utils import EventType
+
+
+def run_server():
+ """This is a wrapper around the interop server and performs the following:
+ 1) Create a 'Summary table' in Big Query to record events like the server
+ started, completed successfully or failed. NOTE: This also creates
+ another table called the QPS table which is currently NOT needed on the
+ server (it is needed on the stress test clients)
+ 2) Start the server process and add a row in Big Query summary table
+ 3) Wait for the server process to terminate. The server process does not
+ terminate unless there is an error.
+ If the server process terminated with a failure, add a row in Big Query
+ and wait forever.
+ NOTE: This script typically runs inside a GKE pod which means that the
+ pod gets destroyed when the script exits. However, in case the server
+ process fails, we would not want the pod to be destroyed (since we
+ might want to connect to the pod for examining logs). This is the
+ reason why the script waits forever in case of failures.
+ """
+
+ # Read the parameters from environment variables
+ env = dict(os.environ)
+
+ run_id = env['RUN_ID'] # The unique run id for this test
+ image_type = env['STRESS_TEST_IMAGE_TYPE']
+ image_name = env['STRESS_TEST_IMAGE']
+ args_str = env['STRESS_TEST_ARGS_STR']
+ pod_name = env['POD_NAME']
+ project_id = env['GCP_PROJECT_ID']
+ dataset_id = env['DATASET_ID']
+ summary_table_id = env['SUMMARY_TABLE_ID']
+ qps_table_id = env['QPS_TABLE_ID']
+
+ logfile_name = env.get('LOGFILE_NAME')  # Optional; every other variable above is required
+
+ bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
+ dataset_id, summary_table_id, qps_table_id)
+ bq_helper.initialize()
+
+ # Create BigQuery Dataset and Tables: Summary Table and Qps Table
+ if not bq_helper.setup_tables():
+ print 'Error in creating BigQuery tables'
+ return
+
+ start_time = datetime.datetime.now()  # NOTE(review): never read again in this function
+
+ logfile = None
+ details = 'Logging to stdout'
+ if logfile_name is not None:
+ print 'Opening log file: ', logfile_name
+ logfile = open(logfile_name, 'w')  # NOTE(review): never closed explicitly
+ details = 'Logfile: %s' % logfile_name
+
+ # Update status that the server is starting (in the summary table)
+ bq_helper.insert_summary_row(EventType.STARTING, details)
+
+ stress_cmd = [image_name] + [x for x in args_str.split()]
+
+ print 'Launching process %s ...' % stress_cmd
+ stress_p = subprocess.Popen(args=stress_cmd,
+ stdout=logfile,
+ stderr=subprocess.STDOUT)
+
+ returncode = stress_p.wait()  # Blocks until the server process exits
+ if returncode != 0:
+ end_time = datetime.datetime.now().isoformat()
+ event_type = EventType.FAILURE
+ details = 'Returncode: %d; End time: %s' % (returncode, end_time)
+ bq_helper.insert_summary_row(event_type, details)
+ print 'Waiting indefinitely..'
+ select.select([], [], [])  # No descriptors and no timeout: blocks so the script does not exit
+ return returncode  # NOTE(review): presumably only reached when returncode == 0 (failure path blocks above); indentation is lost in this view - confirm
+
+
+if __name__ == '__main__':
+ run_server()  # NOTE(review): the value returned by run_server() is discarded here
diff --git a/tools/run_tests/stress_test/stress_test_utils.py b/tools/run_tests/stress_test/stress_test_utils.py
new file mode 100755
index 0000000000..a0626ce3ac
--- /dev/null
+++ b/tools/run_tests/stress_test/stress_test_utils.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python2.7
+# Copyright 2015-2016, Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import datetime
+import json
+import os
+import re
+import select
+import subprocess
+import sys
+import time
+
+# Import big_query_utils module
+bq_utils_dir = os.path.abspath(os.path.join(
+ os.path.dirname(__file__), '../../big_query'))
+sys.path.append(bq_utils_dir)
+import big_query_utils as bq_utils
+
+class EventType:  # Event names written to the 'event_type' field of summary rows
+ STARTING = 'STARTING'
+ SUCCESS = 'SUCCESS'
+ FAILURE = 'FAILURE'
+
+class BigQueryHelper:
+ """Helper class for the stress test wrappers to interact with BigQuery.
+ """
+
+ def __init__(self, run_id, image_type, pod_name, project_id, dataset_id,
+ summary_table_id, qps_table_id):
+ self.run_id = run_id
+ self.image_type = image_type
+ self.pod_name = pod_name
+ self.project_id = project_id
+ self.dataset_id = dataset_id
+ self.summary_table_id = summary_table_id
+ self.qps_table_id = qps_table_id
+
+ def initialize(self):  # self.bq is assigned only here, so call this before any other method
+ self.bq = bq_utils.create_big_query()
+
+ def setup_tables(self):  # Truthy only if the dataset and both tables calls all return truthy
+ return bq_utils.create_dataset(self.bq, self.project_id, self.dataset_id) \
+ and self.__create_summary_table() \
+ and self.__create_qps_table()
+
+ def insert_summary_row(self, event_type, details):  # Adds one event row to the summary table
+ row_values_dict = {
+ 'run_id': self.run_id,
+ 'image_type': self.image_type,
+ 'pod_name': self.pod_name,
+ 'event_date': datetime.datetime.now().isoformat(),
+ 'event_type': event_type,
+ 'details': details
+ }
+ # Something that uniquely identifies the row (BigQuery needs it for duplicate
+ # detection).
+ row_unique_id = '%s_%s_%s' % (self.run_id, self.pod_name, event_type)
+
+ row = bq_utils.make_row(row_unique_id, row_values_dict)
+ return bq_utils.insert_rows(self.bq, self.project_id, self.dataset_id,
+ self.summary_table_id, [row])
+
+ def insert_qps_row(self, qps, recorded_at):  # Adds one qps sample row to the qps table
+ row_values_dict = {
+ 'run_id': self.run_id,
+ 'pod_name': self.pod_name,
+ 'recorded_at': recorded_at,
+ 'qps': qps
+ }
+
+ row_unique_id = '%s_%s_%s' % (self.run_id, self.pod_name, recorded_at)  # Row id is keyed on the sample timestamp
+ row = bq_utils.make_row(row_unique_id, row_values_dict)
+ return bq_utils.insert_rows(self.bq, self.project_id, self.dataset_id,
+ self.qps_table_id, [row])
+
+ def check_if_any_tests_failed(self, num_query_retries=3):  # True if this run has any FAILURE summary row
+ query = ('SELECT event_type FROM %s.%s WHERE run_id = %s AND '  # run_id is interpolated unquoted (INTEGER column in the schema)
+ 'event_type="%s"') % (self.dataset_id, self.summary_table_id,
+ self.run_id, EventType.FAILURE)
+ query_job = bq_utils.sync_query_job(self.bq, self.project_id, query)
+ page = self.bq.jobs().getQueryResults(**query_job['jobReference']).execute(
+ num_retries=num_query_retries)
+ print page  # NOTE(review): prints the whole raw response object
+ num_failures = int(page['totalRows'])
+ print 'num rows: ', num_failures
+ return num_failures > 0
+
+ def print_summary_records(self, num_query_retries=3):  # Prints this run's summary rows ordered by event_date
+ line = '-' * 120
+ print line
+ print 'Summary records'
+ print 'Run Id', self.run_id
+ print line
+ query = ('SELECT pod_name, image_type, event_type, event_date, details'
+ ' FROM %s.%s WHERE run_id = %s ORDER by event_date;') % (
+ self.dataset_id, self.summary_table_id, self.run_id)
+ query_job = bq_utils.sync_query_job(self.bq, self.project_id, query)
+
+ print '{:<25} {:<12} {:<12} {:<30} {}'.format(
+ 'Pod name', 'Image type', 'Event type', 'Date', 'Details')
+ print line
+ page_token = None
+ while True:  # Fetch result pages until the response carries no pageToken
+ page = self.bq.jobs().getQueryResults(
+ pageToken=page_token,
+ **query_job['jobReference']).execute(num_retries=num_query_retries)
+ rows = page.get('rows', [])
+ for row in rows:
+ print '{:<25} {:<12} {:<12} {:<30} {}'.format(  # Cells are read positionally, in SELECT column order
+ row['f'][0]['v'], row['f'][1]['v'], row['f'][2]['v'],
+ row['f'][3]['v'], row['f'][4]['v'])
+ page_token = page.get('pageToken')
+ if not page_token:
+ break
+
+ def print_qps_records(self, num_query_retries=3):  # Prints this run's qps rows ordered by recorded_at
+ line = '-' * 80
+ print line
+ print 'QPS Summary'
+ print 'Run Id: ', self.run_id
+ print line
+ query = (
+ 'SELECT pod_name, recorded_at, qps FROM %s.%s WHERE run_id = %s ORDER '
+ 'by recorded_at;') % (self.dataset_id, self.qps_table_id, self.run_id)
+ query_job = bq_utils.sync_query_job(self.bq, self.project_id, query)
+ print '{:<25} {:30} {}'.format('Pod name', 'Recorded at', 'Qps')
+ print line
+ page_token = None
+ while True:  # Fetch result pages until the response carries no pageToken
+ page = self.bq.jobs().getQueryResults(
+ pageToken=page_token,
+ **query_job['jobReference']).execute(num_retries=num_query_retries)
+ rows = page.get('rows', [])
+ for row in rows:
+ print '{:<25} {:30} {}'.format(row['f'][0]['v'], row['f'][1]['v'],  # Cells are read positionally, in SELECT column order
+ row['f'][2]['v'])
+ page_token = page.get('pageToken')
+ if not page_token:
+ break
+
+ def __create_summary_table(self):  # Schema entries are (column name, column type, description) tuples
+ summary_table_schema = [
+ ('run_id', 'INTEGER', 'Test run id'),
+ ('image_type', 'STRING', 'Client or Server?'),
+ ('pod_name', 'STRING', 'GKE pod hosting this image'),
+ ('event_date', 'STRING', 'The date of this event'),
+ ('event_type', 'STRING', 'STARTED/SUCCESS/FAILURE'),  # NOTE(review): EventType's value is 'STARTING', not 'STARTED'
+ ('details', 'STRING', 'Any other relevant details')
+ ]
+ desc = ('The table that contains START/SUCCESS/FAILURE events for '
+ ' the stress test clients and servers')  # NOTE(review): the two pieces join with a double space ("for  the")
+ return bq_utils.create_table(self.bq, self.project_id, self.dataset_id,
+ self.summary_table_id, summary_table_schema,
+ desc)
+
+ def __create_qps_table(self):  # Schema entries are (column name, column type, description) tuples
+ qps_table_schema = [
+ ('run_id', 'INTEGER', 'Test run id'),
+ ('pod_name', 'STRING', 'GKE pod hosting this image'),
+ ('recorded_at', 'STRING', 'Metrics recorded at time'),
+ ('qps', 'INTEGER', 'Queries per second')
+ ]
+ desc = 'The table that cointains the qps recorded at various intervals'  # NOTE(review): typo "cointains" in this runtime string
+ return bq_utils.create_table(self.bq, self.project_id, self.dataset_id,
+ self.qps_table_id, qps_table_schema, desc)