From 559e45becd0a50bd6af850900abbb2b5759f8719 Mon Sep 17 00:00:00 2001 From: Sree Kuchibhotla Date: Fri, 19 Feb 2016 03:02:16 -0800 Subject: Scripts to launch stress tests in GKE --- tools/run_tests/stress_test/run_client.py | 188 ++++++++++++++++++++++ tools/run_tests/stress_test/run_server.py | 115 ++++++++++++++ tools/run_tests/stress_test/stress_test_utils.py | 192 +++++++++++++++++++++++ 3 files changed, 495 insertions(+) create mode 100755 tools/run_tests/stress_test/run_client.py create mode 100755 tools/run_tests/stress_test/run_server.py create mode 100755 tools/run_tests/stress_test/stress_test_utils.py (limited to 'tools/run_tests/stress_test') diff --git a/tools/run_tests/stress_test/run_client.py b/tools/run_tests/stress_test/run_client.py new file mode 100755 index 0000000000..33958bce49 --- /dev/null +++ b/tools/run_tests/stress_test/run_client.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python2.7 +# Copyright 2015-2016, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import datetime +import os +import re +import select +import subprocess +import sys +import time + +from stress_test_utils import EventType +from stress_test_utils import BigQueryHelper + + +# TODO (sree): Write a python grpc client to directly query the metrics instead +# of calling metrics_client +def _get_qps(metrics_cmd): + qps = 0 + try: + # Note: gpr_log() writes even non-error messages to stderr stream. So it is + # important that we set stderr=subprocess.STDOUT + p = subprocess.Popen(args=metrics_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + retcode = p.wait() + (out_str, err_str) = p.communicate() + if retcode != 0: + print 'Error in reading metrics information' + print 'Output: ', out_str + else: + # The overall qps is printed at the end of the line + m = re.search('\d+$', out_str) + qps = int(m.group()) if m else 0 + except Exception as ex: + print 'Exception while reading metrics information: ' + str(ex) + return qps + + +def run_client(): + """This is a wrapper around the stress test client and performs the following: + 1) Create the following two tables in Big Query: + (i) Summary table: To record events like the test started, completed + successfully or failed + (ii) Qps table: To periodically record the QPS sent by this client + 2) Start the stress test client and add a row in the Big Query summary + table + 3) Once every few seconds (as specificed by the poll_interval_secs) poll + the status of the stress test client process and perform the + following: + 3.1) If the process is still running, get the current qps by invoking + the metrics client program and add a row in the Big Query + Qps table. Sleep for a duration specified by poll_interval_secs + 3.2) If the process exited successfully, add a row in the Big Query + Summary table and exit + 3.3) If the process failed, add a row in Big Query summary table and + wait forever. + NOTE: This script typically runs inside a GKE pod which means + that the pod gets destroyed when the script exits. However, in + case the stress test client fails, we would not want the pod to + be destroyed (since we might want to connect to the pod for + examining logs). This is the reason why the script waits forever + in case of failures + """ + env = dict(os.environ) + image_type = env['STRESS_TEST_IMAGE_TYPE'] + image_name = env['STRESS_TEST_IMAGE'] + args_str = env['STRESS_TEST_ARGS_STR'] + metrics_client_image = env['METRICS_CLIENT_IMAGE'] + metrics_client_args_str = env['METRICS_CLIENT_ARGS_STR'] + run_id = env['RUN_ID'] + pod_name = env['POD_NAME'] + logfile_name = env.get('LOGFILE_NAME') + poll_interval_secs = float(env['POLL_INTERVAL_SECS']) + project_id = env['GCP_PROJECT_ID'] + dataset_id = env['DATASET_ID'] + summary_table_id = env['SUMMARY_TABLE_ID'] + qps_table_id = env['QPS_TABLE_ID'] + + bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id, + dataset_id, summary_table_id, qps_table_id) + bq_helper.initialize() + + # Create BigQuery Dataset and Tables: Summary Table and Metrics Table + if not bq_helper.setup_tables(): + print 'Error in creating BigQuery tables' + return + + start_time = datetime.datetime.now() + + logfile = None + details = 'Logging to stdout' + if logfile_name is not None: + print 'Opening logfile: %s ...' % logfile_name + details = 'Logfile: %s' % logfile_name + logfile = open(logfile_name, 'w') + + # Update status that the test is starting (in the status table) + bq_helper.insert_summary_row(EventType.STARTING, details) + + metrics_cmd = [metrics_client_image + ] + [x for x in metrics_client_args_str.split()] + stress_cmd = [image_name] + [x for x in args_str.split()] + + print 'Launching process %s ...' % stress_cmd + stress_p = subprocess.Popen(args=stress_cmd, + stdout=logfile, + stderr=subprocess.STDOUT) + + qps_history = [1, 1, 1] # Maintain the last 3 qps readings + qps_history_idx = 0 # Index into the qps_history list + + is_error = False + while True: + # Check if stress_client is still running. If so, collect metrics and upload + # to BigQuery status table + if stress_p.poll() is not None: + # TODO(sree) Upload completion status to BigQuery + end_time = datetime.datetime.now().isoformat() + event_type = EventType.SUCCESS + details = 'End time: %s' % end_time + if stress_p.returncode != 0: + event_type = EventType.FAILURE + details = 'Return code = %d. End time: %s' % (stress_p.returncode, + end_time) + is_error = True + bq_helper.insert_summary_row(event_type, details) + print details + break + + # Stress client still running. Get metrics + qps = _get_qps(metrics_cmd) + qps_recorded_at = datetime.datetime.now().isoformat() + print 'qps: %d at %s' % (qps, qps_recorded_at) + + # If QPS has been zero for the last 3 iterations, flag it as error and exit + qps_history[qps_history_idx] = qps + qps_history_idx = (qps_history_idx + 1) % len(qps_history) + if sum(qps_history) == 0: + details = 'QPS has been zero for the last %d seconds - as of : %s' % ( + poll_interval_secs * 3, qps_recorded_at) + is_error = True + bq_helper.insert_summary_row(EventType.FAILURE, details) + print details + break + + # Upload qps metrics to BiqQuery + bq_helper.insert_qps_row(qps, qps_recorded_at) + + time.sleep(poll_interval_secs) + + if is_error: + print 'Waiting indefinitely..' + select.select([], [], []) + + print 'Completed' + return + + +if __name__ == '__main__': + run_client() diff --git a/tools/run_tests/stress_test/run_server.py b/tools/run_tests/stress_test/run_server.py new file mode 100755 index 0000000000..9ad8d63638 --- /dev/null +++ b/tools/run_tests/stress_test/run_server.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python2.7 +# Copyright 2015-2016, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import datetime +import os +import select +import subprocess +import sys +import time + +from stress_test_utils import BigQueryHelper +from stress_test_utils import EventType + + +def run_server(): + """This is a wrapper around the interop server and performs the following: + 1) Create a 'Summary table' in Big Query to record events like the server + started, completed successfully or failed. NOTE: This also creates + another table called the QPS table which is currently NOT needed on the + server (it is needed on the stress test clients) + 2) Start the server process and add a row in Big Query summary table + 3) Wait for the server process to terminate. The server process does not + terminate unless there is an error. + If the server process terminated with a failure, add a row in Big Query + and wait forever. + NOTE: This script typically runs inside a GKE pod which means that the + pod gets destroyed when the script exits. However, in case the server + process fails, we would not want the pod to be destroyed (since we + might want to connect to the pod for examining logs). This is the + reason why the script waits forever in case of failures. + """ + + # Read the parameters from environment variables + env = dict(os.environ) + + run_id = env['RUN_ID'] # The unique run id for this test + image_type = env['STRESS_TEST_IMAGE_TYPE'] + image_name = env['STRESS_TEST_IMAGE'] + args_str = env['STRESS_TEST_ARGS_STR'] + pod_name = env['POD_NAME'] + project_id = env['GCP_PROJECT_ID'] + dataset_id = env['DATASET_ID'] + summary_table_id = env['SUMMARY_TABLE_ID'] + qps_table_id = env['QPS_TABLE_ID'] + + logfile_name = env.get('LOGFILE_NAME') + + bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id, + dataset_id, summary_table_id, qps_table_id) + bq_helper.initialize() + + # Create BigQuery Dataset and Tables: Summary Table and Metrics Table + if not bq_helper.setup_tables(): + print 'Error in creating BigQuery tables' + return + + start_time = datetime.datetime.now() + + logfile = None + details = 'Logging to stdout' + if logfile_name is not None: + print 'Opening log file: ', logfile_name + logfile = open(logfile_name, 'w') + details = 'Logfile: %s' % logfile_name + + # Update status that the test is starting (in the status table) + bq_helper.insert_summary_row(EventType.STARTING, details) + + stress_cmd = [image_name] + [x for x in args_str.split()] + + print 'Launching process %s ...' % stress_cmd + stress_p = subprocess.Popen(args=stress_cmd, + stdout=logfile, + stderr=subprocess.STDOUT) + + returncode = stress_p.wait() + if returncode != 0: + end_time = datetime.datetime.now().isoformat() + event_type = EventType.FAILURE + details = 'Returncode: %d; End time: %s' % (returncode, end_time) + bq_helper.insert_summary_row(event_type, details) + print 'Waiting indefinitely..' + select.select([], [], []) + return returncode + + +if __name__ == '__main__': + run_server() diff --git a/tools/run_tests/stress_test/stress_test_utils.py b/tools/run_tests/stress_test/stress_test_utils.py new file mode 100755 index 0000000000..a0626ce3ac --- /dev/null +++ b/tools/run_tests/stress_test/stress_test_utils.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python2.7 +# Copyright 2015-2016, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import datetime +import json +import os +import re +import select +import subprocess +import sys +import time + +# Import big_query_utils module +bq_utils_dir = os.path.abspath(os.path.join( + os.path.dirname(__file__), '../../big_query')) +sys.path.append(bq_utils_dir) +import big_query_utils as bq_utils + +class EventType: + STARTING = 'STARTING' + SUCCESS = 'SUCCESS' + FAILURE = 'FAILURE' + +class BigQueryHelper: + """Helper class for the stress test wrappers to interact with BigQuery. + """ + + def __init__(self, run_id, image_type, pod_name, project_id, dataset_id, + summary_table_id, qps_table_id): + self.run_id = run_id + self.image_type = image_type + self.pod_name = pod_name + self.project_id = project_id + self.dataset_id = dataset_id + self.summary_table_id = summary_table_id + self.qps_table_id = qps_table_id + + def initialize(self): + self.bq = bq_utils.create_big_query() + + def setup_tables(self): + return bq_utils.create_dataset(self.bq, self.project_id, self.dataset_id) \ + and self.__create_summary_table() \ + and self.__create_qps_table() + + def insert_summary_row(self, event_type, details): + row_values_dict = { + 'run_id': self.run_id, + 'image_type': self.image_type, + 'pod_name': self.pod_name, + 'event_date': datetime.datetime.now().isoformat(), + 'event_type': event_type, + 'details': details + } + # Something that uniquely identifies the row (Biquery needs it for duplicate + # detection). + row_unique_id = '%s_%s_%s' % (self.run_id, self.pod_name, event_type) + + row = bq_utils.make_row(row_unique_id, row_values_dict) + return bq_utils.insert_rows(self.bq, self.project_id, self.dataset_id, + self.summary_table_id, [row]) + + def insert_qps_row(self, qps, recorded_at): + row_values_dict = { + 'run_id': self.run_id, + 'pod_name': self.pod_name, + 'recorded_at': recorded_at, + 'qps': qps + } + + row_unique_id = '%s_%s_%s' % (self.run_id, self.pod_name, recorded_at) + row = bq_utils.make_row(row_unique_id, row_values_dict) + return bq_utils.insert_rows(self.bq, self.project_id, self.dataset_id, + self.qps_table_id, [row]) + + def check_if_any_tests_failed(self, num_query_retries=3): + query = ('SELECT event_type FROM %s.%s WHERE run_id = %s AND ' + 'event_type="%s"') % (self.dataset_id, self.summary_table_id, + self.run_id, EventType.FAILURE) + query_job = bq_utils.sync_query_job(self.bq, self.project_id, query) + page = self.bq.jobs().getQueryResults(**query_job['jobReference']).execute( + num_retries=num_query_retries) + print page + num_failures = int(page['totalRows']) + print 'num rows: ', num_failures + return num_failures > 0 + + def print_summary_records(self, num_query_retries=3): + line = '-' * 120 + print line + print 'Summary records' + print 'Run Id', self.run_id + print line + query = ('SELECT pod_name, image_type, event_type, event_date, details' + ' FROM %s.%s WHERE run_id = %s ORDER by event_date;') % ( + self.dataset_id, self.summary_table_id, self.run_id) + query_job = bq_utils.sync_query_job(self.bq, self.project_id, query) + + print '{:<25} {:<12} {:<12} {:<30} {}'.format( + 'Pod name', 'Image type', 'Event type', 'Date', 'Details') + print line + page_token = None + while True: + page = self.bq.jobs().getQueryResults( + pageToken=page_token, + **query_job['jobReference']).execute(num_retries=num_query_retries) + rows = page.get('rows', []) + for row in rows: + print '{:<25} {:<12} {:<12} {:<30} {}'.format( + row['f'][0]['v'], row['f'][1]['v'], row['f'][2]['v'], + row['f'][3]['v'], row['f'][4]['v']) + page_token = page.get('pageToken') + if not page_token: + break + + def print_qps_records(self, num_query_retries=3): + line = '-' * 80 + print line + print 'QPS Summary' + print 'Run Id: ', self.run_id + print line + query = ( + 'SELECT pod_name, recorded_at, qps FROM %s.%s WHERE run_id = %s ORDER ' + 'by recorded_at;') % (self.dataset_id, self.qps_table_id, self.run_id) + query_job = bq_utils.sync_query_job(self.bq, self.project_id, query) + print '{:<25} {:30} {}'.format('Pod name', 'Recorded at', 'Qps') + print line + page_token = None + while True: + page = self.bq.jobs().getQueryResults( + pageToken=page_token, + **query_job['jobReference']).execute(num_retries=num_query_retries) + rows = page.get('rows', []) + for row in rows: + print '{:<25} {:30} {}'.format(row['f'][0]['v'], row['f'][1]['v'], + row['f'][2]['v']) + page_token = page.get('pageToken') + if not page_token: + break + + def __create_summary_table(self): + summary_table_schema = [ + ('run_id', 'INTEGER', 'Test run id'), + ('image_type', 'STRING', 'Client or Server?'), + ('pod_name', 'STRING', 'GKE pod hosting this image'), + ('event_date', 'STRING', 'The date of this event'), + ('event_type', 'STRING', 'STARTED/SUCCESS/FAILURE'), + ('details', 'STRING', 'Any other relevant details') + ] + desc = ('The table that contains START/SUCCESS/FAILURE events for ' + ' the stress test clients and servers') + return bq_utils.create_table(self.bq, self.project_id, self.dataset_id, + self.summary_table_id, summary_table_schema, + desc) + + def __create_qps_table(self): + qps_table_schema = [ + ('run_id', 'INTEGER', 'Test run id'), + ('pod_name', 'STRING', 'GKE pod hosting this image'), + ('recorded_at', 'STRING', 'Metrics recorded at time'), + ('qps', 'INTEGER', 'Queries per second') + ] + desc = 'The table that cointains the qps recorded at various intervals' + return bq_utils.create_table(self.bq, self.project_id, self.dataset_id, + self.qps_table_id, qps_table_schema, desc) -- cgit v1.2.3