From 7d037a5ee2ebdba2fb8d0688399151878818c297 Mon Sep 17 00:00:00 2001 From: Sree Kuchibhotla Date: Fri, 12 Feb 2016 19:49:23 -0800 Subject: first version of bigquery python helpers --- tools/bigquery/big_query_utils.py | 181 +++++++++++++++++++++ .../dockerfile/grpc_interop_stress_cxx/Dockerfile | 5 + 2 files changed, 186 insertions(+) create mode 100644 tools/bigquery/big_query_utils.py (limited to 'tools') diff --git a/tools/bigquery/big_query_utils.py b/tools/bigquery/big_query_utils.py new file mode 100644 index 0000000000..ebcf9d6ec3 --- /dev/null +++ b/tools/bigquery/big_query_utils.py @@ -0,0 +1,181 @@ +import argparse +import json +import uuid +import httplib2 + +from apiclient import discovery +from apiclient.errors import HttpError +from oauth2client.client import GoogleCredentials + +NUM_RETRIES = 3 + + +def create_bq(): + """Authenticates with cloud platform and gets a BiqQuery service object + """ + creds = GoogleCredentials.get_application_default() + return discovery.build('bigquery', 'v2', credentials=creds) + + +def create_ds(biq_query, project_id, dataset_id): + is_success = True + body = { + 'datasetReference': { + 'projectId': project_id, + 'datasetId': dataset_id + } + } + try: + dataset_req = biq_query.datasets().insert(projectId=project_id, body=body) + dataset_req.execute(num_retries=NUM_RETRIES) + except HttpError as http_error: + if http_error.resp.status == 409: + print 'Warning: The dataset %s already exists' % dataset_id + else: + # Note: For more debugging info, print "http_error.content" + print 'Error in creating dataset: %s. Err: %s' % (dataset_id, http_error) + is_success = False + return is_success + + +def make_field(field_name, field_type, field_description): + return { + 'name': field_name, + 'type': field_type, + 'description': field_description + } + + +def create_table(big_query, project_id, dataset_id, table_id, fields_list, + description): + is_success = True + body = { + 'description': description, + 'schema': { + 'fields': fields_list + }, + 'tableReference': { + 'datasetId': dataset_id, + 'projectId': project_id, + 'tableId': table_id + } + } + try: + table_req = big_query.tables().insert(projectId=project_id, + datasetId=dataset_id, + body=body) + res = table_req.execute(num_retries=NUM_RETRIES) + print 'Successfully created %s "%s"' % (res['kind'], res['id']) + except HttpError as http_error: + if http_error.resp.status == 409: + print 'Warning: Table %s already exists' % table_id + else: + print 'Error in creating table: %s. 
Err: %s' % (table_id, http_error) + is_success = False + return is_success + + +def insert_rows(big_query, project_id, dataset_id, table_id, rows_list): + is_success = True + body = {'rows': rows_list} + try: + insert_req = big_query.tabledata().insertAll(projectId=project_id, + datasetId=dataset_id, + tableId=table_id, + body=body) + print body + res = insert_req.execute(num_retries=NUM_RETRIES) + print res + except HttpError as http_error: + print 'Error in inserting rows in the table %s' % table_id + is_success = False + return is_success + +##################### + + +def make_emp_row(emp_id, emp_name, emp_email): + return { + 'insertId': str(emp_id), + 'json': { + 'emp_id': emp_id, + 'emp_name': emp_name, + 'emp_email_id': emp_email + } + } + + +def get_emp_table_fields_list(): + return [ + make_field('emp_id', 'INTEGER', 'Employee id'), + make_field('emp_name', 'STRING', 'Employee name'), + make_field('emp_email_id', 'STRING', 'Employee email id') + ] + + +def insert_emp_rows(big_query, project_id, dataset_id, table_id, start_idx, + num_rows): + rows_list = [make_emp_row(i, 'sree_%d' % i, 'sreecha_%d@gmail.com' % i) + for i in range(start_idx, start_idx + num_rows)] + insert_rows(big_query, project_id, dataset_id, table_id, rows_list) + + +def create_emp_table(big_query, project_id, dataset_id, table_id): + fields_list = get_emp_table_fields_list() + description = 'Test table created by sree' + create_table(big_query, project_id, dataset_id, table_id, fields_list, + description) + + +def sync_query(big_query, project_id, query, timeout=5000): + query_data = {'query': query, 'timeoutMs': timeout} + query_job = None + try: + query_job = big_query.jobs().query( + projectId=project_id, + body=query_data).execute(num_retries=NUM_RETRIES) + except HttpError as http_error: + print 'Query execute job failed with error: %s' % http_error + print http_error.content + return query_job + +#[Start query_emp_records] +def query_emp_records(big_query, project_id, dataset_id, table_id): + query = 'SELECT emp_id, emp_name FROM %s.%s ORDER BY emp_id;' % (dataset_id, table_id) + print query + query_job = sync_query(big_query, project_id, query, 5000) + job_id = query_job['jobReference'] + + print query_job + print '**Starting paging **' + #[Start Paging] + page_token = None + while True: + page = big_query.jobs().getQueryResults( + pageToken=page_token, + **query_job['jobReference']).execute(num_retries=NUM_RETRIES) + rows = page['rows'] + for row in rows: + print row['f'][0]['v'], "---", row['f'][1]['v'] + page_token = page.get('pageToken') + if not page_token: + break + #[End Paging] +#[End query_emp_records] + +######################### +DATASET_SEQ_NUM = 1 +TABLE_SEQ_NUM = 11 + +PROJECT_ID = 'sree-gce' +DATASET_ID = 'sree_test_dataset_%d' % DATASET_SEQ_NUM +TABLE_ID = 'sree_test_table_%d' % TABLE_SEQ_NUM + +EMP_ROW_IDX = 10 +EMP_NUM_ROWS = 5 + +bq = create_bq() +create_ds(bq, PROJECT_ID, DATASET_ID) +create_emp_table(bq, PROJECT_ID, DATASET_ID, TABLE_ID) +insert_emp_rows(bq, PROJECT_ID, DATASET_ID, TABLE_ID, EMP_ROW_IDX, EMP_NUM_ROWS) +query_emp_records(bq, PROJECT_ID, DATASET_ID, TABLE_ID) diff --git a/tools/dockerfile/grpc_interop_stress_cxx/Dockerfile b/tools/dockerfile/grpc_interop_stress_cxx/Dockerfile index 58a8c32e34..4123cc1a26 100644 --- a/tools/dockerfile/grpc_interop_stress_cxx/Dockerfile +++ b/tools/dockerfile/grpc_interop_stress_cxx/Dockerfile @@ -59,6 +59,8 @@ RUN apt-get update && apt-get install -y \ wget \ zip && apt-get clean +RUN easy_install -U pip + # Prepare ccache RUN ln -s 
/usr/bin/ccache /usr/local/bin/gcc
RUN ln -s /usr/bin/ccache /usr/local/bin/g++
@@ -71,5 +73,8 @@ RUN ln -s /usr/bin/ccache /usr/local/bin/clang++
 # C++ dependencies
 RUN apt-get update && apt-get -y install libgflags-dev libgtest-dev libc++-dev clang

+# Google Cloud platform API libraries (for BigQuery)
+RUN pip install --upgrade google-api-python-client
+
 # Define the default command.
 CMD ["bash"]
--
cgit v1.2.3


From 44ca2c26409a172b80bc9f40f7578f3eaf1d135d Mon Sep 17 00:00:00 2001
From: Sree Kuchibhotla
Date: Tue, 16 Feb 2016 09:48:36 -0800
Subject: Examples

---
 test/cpp/interop/metrics_client.cc     |  41 +++--
 tools/bigquery/big_query_utils.py      | 181 ---------------------
 .../build_interop_stress.sh            |   2 +-
 tools/gke/big_query_utils.py           | 181 +++++++++++++++++++++
 tools/gke/create_client.py             | 108 ++++++++++++
 tools/gke/create_server.py             |  74 +++++++++
 tools/gke/delete_client.py             |  66 ++++++++
 tools/gke/delete_server.py             |  58 +++++++
 tools/gke/kubernetes_api.py            |  35 ++--
 tools/run_tests/stress_test_wrapper.py |  96 +++++++++++
 10 files changed, 632 insertions(+), 210 deletions(-)
 delete mode 100644 tools/bigquery/big_query_utils.py
 create mode 100644 tools/gke/big_query_utils.py
 create mode 100755 tools/gke/create_client.py
 create mode 100755 tools/gke/create_server.py
 create mode 100755 tools/gke/delete_client.py
 create mode 100755 tools/gke/delete_server.py
 create mode 100755 tools/run_tests/stress_test_wrapper.py
(limited to 'tools')

diff --git a/test/cpp/interop/metrics_client.cc b/test/cpp/interop/metrics_client.cc
index 0c140ffd85..cc304f2e89 100644
--- a/test/cpp/interop/metrics_client.cc
+++ b/test/cpp/interop/metrics_client.cc
@@ -37,39 +37,45 @@
 #include
 #include
-#include "test/cpp/util/metrics_server.h"
-#include "test/cpp/util/test_config.h"
 #include "src/proto/grpc/testing/metrics.grpc.pb.h"
 #include "src/proto/grpc/testing/metrics.pb.h"
+#include "test/cpp/util/metrics_server.h"
+#include "test/cpp/util/test_config.h"

 DEFINE_string(metrics_server_address, "",
               "The metrics server addresses in the format <hostname>:<port>");
+DEFINE_bool(total_only, false,
+            "If true, this prints only the total value of all gauges");
+
+int kDeadlineSecs = 10;

 using grpc::testing::EmptyMessage;
 using grpc::testing::GaugeResponse;
 using grpc::testing::MetricsService;
 using grpc::testing::MetricsServiceImpl;

-void PrintMetrics(const grpc::string& server_address) {
-  gpr_log(GPR_INFO, "creating a channel to %s", server_address.c_str());
-  std::shared_ptr<grpc::Channel> channel(
-      grpc::CreateChannel(server_address, grpc::InsecureChannelCredentials()));
-
-  std::unique_ptr<MetricsService::Stub> stub(MetricsService::NewStub(channel));
-
+// Prints the values of all Gauges (unless total_only is set to 'true' in which
+// case this only prints the sum of all gauge values).
+bool PrintMetrics(std::unique_ptr<MetricsService::Stub> stub, bool total_only) {
   grpc::ClientContext context;
   EmptyMessage message;
+
+  std::chrono::system_clock::time_point deadline =
+      std::chrono::system_clock::now() + std::chrono::seconds(kDeadlineSecs);
+
+  context.set_deadline(deadline);
+
   std::unique_ptr<grpc::ClientReader<GaugeResponse>> reader(
       stub->GetAllGauges(&context, message));

   GaugeResponse gauge_response;
   long overall_qps = 0;
-  int idx = 0;
   while (reader->Read(&gauge_response)) {
     if (gauge_response.value_case() == GaugeResponse::kLongValue) {
-      gpr_log(GPR_INFO, "Gauge: %d (%s: %ld)", ++idx,
-              gauge_response.name().c_str(), gauge_response.long_value());
+      if (!total_only) {
+        gpr_log(GPR_INFO, "%s: %ld", gauge_response.name().c_str(),
+                gauge_response.long_value());
+      }
       overall_qps += gauge_response.long_value();
     } else {
       gpr_log(GPR_INFO, "Gauge %s is not a long value",
@@ -77,12 +83,14 @@ void PrintMetrics(const grpc::string& server_address) {
     }
   }

-  gpr_log(GPR_INFO, "OVERALL: %ld", overall_qps);
+  gpr_log(GPR_INFO, "%ld", overall_qps);

   const grpc::Status status = reader->Finish();
   if (!status.ok()) {
     gpr_log(GPR_ERROR, "Error in getting metrics from the client");
   }
+
+  return status.ok();
 }

 int main(int argc, char** argv) {
@@ -97,7 +105,12 @@ int main(int argc, char** argv) {
     return 1;
   }

-  PrintMetrics(FLAGS_metrics_server_address);
+  std::shared_ptr<grpc::Channel> channel(grpc::CreateChannel(
+      FLAGS_metrics_server_address, grpc::InsecureChannelCredentials()));
+
+  if (!PrintMetrics(MetricsService::NewStub(channel), FLAGS_total_only)) {
+    return 1;
+  }

   return 0;
 }
diff --git a/tools/bigquery/big_query_utils.py b/tools/bigquery/big_query_utils.py
deleted file mode 100644
index ebcf9d6ec3..0000000000
--- a/tools/bigquery/big_query_utils.py
+++ /dev/null
@@ -1,181 +0,0 @@
-import argparse
-import json
-import uuid
-import httplib2
-
-from apiclient import discovery
-from apiclient.errors import HttpError
-from oauth2client.client import GoogleCredentials
-
-NUM_RETRIES = 3
-
-
-def create_bq():
-  """Authenticates with cloud platform and gets a BiqQuery service object
-  """
-  creds = GoogleCredentials.get_application_default()
-  return discovery.build('bigquery', 'v2', credentials=creds)
-
-
-def create_ds(biq_query, project_id, dataset_id):
-  is_success = True
-  body = {
-      'datasetReference': {
-          'projectId': project_id,
-          'datasetId': dataset_id
-      }
-  }
-  try:
-    dataset_req = biq_query.datasets().insert(projectId=project_id, body=body)
-    dataset_req.execute(num_retries=NUM_RETRIES)
-  except HttpError as http_error:
-    if http_error.resp.status == 409:
-      print 'Warning: The dataset %s already exists' % dataset_id
-    else:
-      # Note: For more debugging info, print "http_error.content"
-      print 'Error in creating dataset: %s.
Err: %s' % (dataset_id, http_error) - is_success = False - return is_success - - -def make_field(field_name, field_type, field_description): - return { - 'name': field_name, - 'type': field_type, - 'description': field_description - } - - -def create_table(big_query, project_id, dataset_id, table_id, fields_list, - description): - is_success = True - body = { - 'description': description, - 'schema': { - 'fields': fields_list - }, - 'tableReference': { - 'datasetId': dataset_id, - 'projectId': project_id, - 'tableId': table_id - } - } - try: - table_req = big_query.tables().insert(projectId=project_id, - datasetId=dataset_id, - body=body) - res = table_req.execute(num_retries=NUM_RETRIES) - print 'Successfully created %s "%s"' % (res['kind'], res['id']) - except HttpError as http_error: - if http_error.resp.status == 409: - print 'Warning: Table %s already exists' % table_id - else: - print 'Error in creating table: %s. Err: %s' % (table_id, http_error) - is_success = False - return is_success - - -def insert_rows(big_query, project_id, dataset_id, table_id, rows_list): - is_success = True - body = {'rows': rows_list} - try: - insert_req = big_query.tabledata().insertAll(projectId=project_id, - datasetId=dataset_id, - tableId=table_id, - body=body) - print body - res = insert_req.execute(num_retries=NUM_RETRIES) - print res - except HttpError as http_error: - print 'Error in inserting rows in the table %s' % table_id - is_success = False - return is_success - -##################### - - -def make_emp_row(emp_id, emp_name, emp_email): - return { - 'insertId': str(emp_id), - 'json': { - 'emp_id': emp_id, - 'emp_name': emp_name, - 'emp_email_id': emp_email - } - } - - -def get_emp_table_fields_list(): - return [ - make_field('emp_id', 'INTEGER', 'Employee id'), - make_field('emp_name', 'STRING', 'Employee name'), - make_field('emp_email_id', 'STRING', 'Employee email id') - ] - - -def insert_emp_rows(big_query, project_id, dataset_id, table_id, start_idx, - num_rows): - rows_list = [make_emp_row(i, 'sree_%d' % i, 'sreecha_%d@gmail.com' % i) - for i in range(start_idx, start_idx + num_rows)] - insert_rows(big_query, project_id, dataset_id, table_id, rows_list) - - -def create_emp_table(big_query, project_id, dataset_id, table_id): - fields_list = get_emp_table_fields_list() - description = 'Test table created by sree' - create_table(big_query, project_id, dataset_id, table_id, fields_list, - description) - - -def sync_query(big_query, project_id, query, timeout=5000): - query_data = {'query': query, 'timeoutMs': timeout} - query_job = None - try: - query_job = big_query.jobs().query( - projectId=project_id, - body=query_data).execute(num_retries=NUM_RETRIES) - except HttpError as http_error: - print 'Query execute job failed with error: %s' % http_error - print http_error.content - return query_job - -#[Start query_emp_records] -def query_emp_records(big_query, project_id, dataset_id, table_id): - query = 'SELECT emp_id, emp_name FROM %s.%s ORDER BY emp_id;' % (dataset_id, table_id) - print query - query_job = sync_query(big_query, project_id, query, 5000) - job_id = query_job['jobReference'] - - print query_job - print '**Starting paging **' - #[Start Paging] - page_token = None - while True: - page = big_query.jobs().getQueryResults( - pageToken=page_token, - **query_job['jobReference']).execute(num_retries=NUM_RETRIES) - rows = page['rows'] - for row in rows: - print row['f'][0]['v'], "---", row['f'][1]['v'] - page_token = page.get('pageToken') - if not page_token: - break - #[End Paging] 
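For reference, the query-plus-paging idiom that closes above (and is carried over verbatim to tools/gke/big_query_utils.py below) can be driven end to end as in this minimal sketch. It assumes the helper names from this file (create_bq, sync_query, NUM_RETRIES); the project, dataset and table names are placeholders, and page.get('rows', []) guards the empty-result case that the bare page['rows'] above would raise KeyError on.

# Minimal usage sketch (names assumed from this file): run a synchronous
# query, then walk every page of results via the job's jobReference handle.
big_query = create_bq()
query_job = sync_query(big_query, 'my-project-id',
                       'SELECT emp_id FROM my_ds.my_table', 5000)

page_token = None
while True:
  page = big_query.jobs().getQueryResults(
      pageToken=page_token,
      **query_job['jobReference']).execute(num_retries=NUM_RETRIES)
  # 'rows' is absent entirely when the result set is empty, hence .get()
  for row in page.get('rows', []):
    print row['f'][0]['v']
  page_token = page.get('pageToken')
  if not page_token:
    break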
-#[End query_emp_records] - -######################### -DATASET_SEQ_NUM = 1 -TABLE_SEQ_NUM = 11 - -PROJECT_ID = 'sree-gce' -DATASET_ID = 'sree_test_dataset_%d' % DATASET_SEQ_NUM -TABLE_ID = 'sree_test_table_%d' % TABLE_SEQ_NUM - -EMP_ROW_IDX = 10 -EMP_NUM_ROWS = 5 - -bq = create_bq() -create_ds(bq, PROJECT_ID, DATASET_ID) -create_emp_table(bq, PROJECT_ID, DATASET_ID, TABLE_ID) -insert_emp_rows(bq, PROJECT_ID, DATASET_ID, TABLE_ID, EMP_ROW_IDX, EMP_NUM_ROWS) -query_emp_records(bq, PROJECT_ID, DATASET_ID, TABLE_ID) diff --git a/tools/dockerfile/grpc_interop_stress_cxx/build_interop_stress.sh b/tools/dockerfile/grpc_interop_stress_cxx/build_interop_stress.sh index 6ed3ccb3fa..392bdfccda 100755 --- a/tools/dockerfile/grpc_interop_stress_cxx/build_interop_stress.sh +++ b/tools/dockerfile/grpc_interop_stress_cxx/build_interop_stress.sh @@ -42,4 +42,4 @@ cd /var/local/git/grpc make install-certs # build C++ interop stress client, interop client and server -make stress_test interop_client interop_server +make stress_test metrics_client interop_client interop_server diff --git a/tools/gke/big_query_utils.py b/tools/gke/big_query_utils.py new file mode 100644 index 0000000000..ebcf9d6ec3 --- /dev/null +++ b/tools/gke/big_query_utils.py @@ -0,0 +1,181 @@ +import argparse +import json +import uuid +import httplib2 + +from apiclient import discovery +from apiclient.errors import HttpError +from oauth2client.client import GoogleCredentials + +NUM_RETRIES = 3 + + +def create_bq(): + """Authenticates with cloud platform and gets a BiqQuery service object + """ + creds = GoogleCredentials.get_application_default() + return discovery.build('bigquery', 'v2', credentials=creds) + + +def create_ds(biq_query, project_id, dataset_id): + is_success = True + body = { + 'datasetReference': { + 'projectId': project_id, + 'datasetId': dataset_id + } + } + try: + dataset_req = biq_query.datasets().insert(projectId=project_id, body=body) + dataset_req.execute(num_retries=NUM_RETRIES) + except HttpError as http_error: + if http_error.resp.status == 409: + print 'Warning: The dataset %s already exists' % dataset_id + else: + # Note: For more debugging info, print "http_error.content" + print 'Error in creating dataset: %s. Err: %s' % (dataset_id, http_error) + is_success = False + return is_success + + +def make_field(field_name, field_type, field_description): + return { + 'name': field_name, + 'type': field_type, + 'description': field_description + } + + +def create_table(big_query, project_id, dataset_id, table_id, fields_list, + description): + is_success = True + body = { + 'description': description, + 'schema': { + 'fields': fields_list + }, + 'tableReference': { + 'datasetId': dataset_id, + 'projectId': project_id, + 'tableId': table_id + } + } + try: + table_req = big_query.tables().insert(projectId=project_id, + datasetId=dataset_id, + body=body) + res = table_req.execute(num_retries=NUM_RETRIES) + print 'Successfully created %s "%s"' % (res['kind'], res['id']) + except HttpError as http_error: + if http_error.resp.status == 409: + print 'Warning: Table %s already exists' % table_id + else: + print 'Error in creating table: %s. 
Err: %s' % (table_id, http_error) + is_success = False + return is_success + + +def insert_rows(big_query, project_id, dataset_id, table_id, rows_list): + is_success = True + body = {'rows': rows_list} + try: + insert_req = big_query.tabledata().insertAll(projectId=project_id, + datasetId=dataset_id, + tableId=table_id, + body=body) + print body + res = insert_req.execute(num_retries=NUM_RETRIES) + print res + except HttpError as http_error: + print 'Error in inserting rows in the table %s' % table_id + is_success = False + return is_success + +##################### + + +def make_emp_row(emp_id, emp_name, emp_email): + return { + 'insertId': str(emp_id), + 'json': { + 'emp_id': emp_id, + 'emp_name': emp_name, + 'emp_email_id': emp_email + } + } + + +def get_emp_table_fields_list(): + return [ + make_field('emp_id', 'INTEGER', 'Employee id'), + make_field('emp_name', 'STRING', 'Employee name'), + make_field('emp_email_id', 'STRING', 'Employee email id') + ] + + +def insert_emp_rows(big_query, project_id, dataset_id, table_id, start_idx, + num_rows): + rows_list = [make_emp_row(i, 'sree_%d' % i, 'sreecha_%d@gmail.com' % i) + for i in range(start_idx, start_idx + num_rows)] + insert_rows(big_query, project_id, dataset_id, table_id, rows_list) + + +def create_emp_table(big_query, project_id, dataset_id, table_id): + fields_list = get_emp_table_fields_list() + description = 'Test table created by sree' + create_table(big_query, project_id, dataset_id, table_id, fields_list, + description) + + +def sync_query(big_query, project_id, query, timeout=5000): + query_data = {'query': query, 'timeoutMs': timeout} + query_job = None + try: + query_job = big_query.jobs().query( + projectId=project_id, + body=query_data).execute(num_retries=NUM_RETRIES) + except HttpError as http_error: + print 'Query execute job failed with error: %s' % http_error + print http_error.content + return query_job + +#[Start query_emp_records] +def query_emp_records(big_query, project_id, dataset_id, table_id): + query = 'SELECT emp_id, emp_name FROM %s.%s ORDER BY emp_id;' % (dataset_id, table_id) + print query + query_job = sync_query(big_query, project_id, query, 5000) + job_id = query_job['jobReference'] + + print query_job + print '**Starting paging **' + #[Start Paging] + page_token = None + while True: + page = big_query.jobs().getQueryResults( + pageToken=page_token, + **query_job['jobReference']).execute(num_retries=NUM_RETRIES) + rows = page['rows'] + for row in rows: + print row['f'][0]['v'], "---", row['f'][1]['v'] + page_token = page.get('pageToken') + if not page_token: + break + #[End Paging] +#[End query_emp_records] + +######################### +DATASET_SEQ_NUM = 1 +TABLE_SEQ_NUM = 11 + +PROJECT_ID = 'sree-gce' +DATASET_ID = 'sree_test_dataset_%d' % DATASET_SEQ_NUM +TABLE_ID = 'sree_test_table_%d' % TABLE_SEQ_NUM + +EMP_ROW_IDX = 10 +EMP_NUM_ROWS = 5 + +bq = create_bq() +create_ds(bq, PROJECT_ID, DATASET_ID) +create_emp_table(bq, PROJECT_ID, DATASET_ID, TABLE_ID) +insert_emp_rows(bq, PROJECT_ID, DATASET_ID, TABLE_ID, EMP_ROW_IDX, EMP_NUM_ROWS) +query_emp_records(bq, PROJECT_ID, DATASET_ID, TABLE_ID) diff --git a/tools/gke/create_client.py b/tools/gke/create_client.py new file mode 100755 index 0000000000..bc56ef0ef1 --- /dev/null +++ b/tools/gke/create_client.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python2.7 +# Copyright 2015, Google Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse + +import kubernetes_api + +argp = argparse.ArgumentParser(description='Launch Stress tests in GKE') + +argp.add_argument('-n', + '--num_instances', + required=True, + type=int, + help='The number of instances to launch in GKE') +args = argp.parse_args() + +kubernetes_api_server="localhost" +kubernetes_api_port=8001 + + +# Docker image +image_name="gcr.io/sree-gce/grpc_stress_test_2" + +server_address = "stress-server.default.svc.cluster.local:8080" +metrics_server_address = "localhost:8081" + +stress_test_arg_list=[ + "--server_addresses=" + server_address, + "--test_cases=empty_unary:20,large_unary:20", + "--num_stubs_per_channel=10" +] + +metrics_client_arg_list=[ + "--metrics_server_address=" + metrics_server_address, + "--total_only=true"] + +env_dict={ + "GPRC_ROOT": "/var/local/git/grpc", + "STRESS_TEST_IMAGE": "/var/local/git/grpc/bins/opt/stress_test", + "STRESS_TEST_ARGS_STR": ' '.join(stress_test_arg_list), + "METRICS_CLIENT_IMAGE": "/var/local/git/grpc/bins/opt/metrics_client", + "METRICS_CLIENT_ARGS_STR": ' '.join(metrics_client_arg_list)} + +cmd_list=["/var/local/git/grpc/bins/opt/stress_test"] +arg_list=stress_test_arg_list # make this [] in future +port_list=[8081] + +namespace = 'default' +is_headless_service = False # Client is NOT headless service + +print('Creating %d instances of client..' 
% args.num_instances) + +for i in range(1, args.num_instances + 1): + service_name = 'stress-client-%d' % i + pod_name = service_name # Use the same name for kubernetes Service and Pod + is_success = kubernetes_api.create_pod( + kubernetes_api_server, + kubernetes_api_port, + namespace, + pod_name, + image_name, + port_list, + cmd_list, + arg_list, + env_dict) + if not is_success: + print("Error in creating pod %s" % pod_name) + else: + is_success = kubernetes_api.create_service( + kubernetes_api_server, + kubernetes_api_port, + namespace, + service_name, + pod_name, + port_list, # Service port list + port_list, # Container port list (same as service port list) + is_headless_service) + if not is_success: + print("Error in creating service %s" % service_name) + else: + print("Created client %s" % pod_name) diff --git a/tools/gke/create_server.py b/tools/gke/create_server.py new file mode 100755 index 0000000000..23ab62c205 --- /dev/null +++ b/tools/gke/create_server.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python2.7 +# Copyright 2015, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
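Both create_client.py above and create_server.py below funnel their port, command and env lists through kubernetes_api.create_pod(). As a rough sketch of what that turns into on the wire for the server (abbreviated, metadata omitted; the exact layout is _make_pod_config()'s job, which is patched later in this series), the Pod body looks like:

# Rough sketch of the Kubernetes v1 Pod body create_pod() builds for the
# interop server; 'command' overrides the image ENTRYPOINT and 'args'
# overrides its CMD.
pod_body = {
    'kind': 'Pod',
    'apiVersion': 'v1',
    'spec': {
        'containers': [{
            'name': 'stress-server',
            'image': 'gcr.io/sree-gce/grpc_stress_test_2',
            'ports': [{'containerPort': 8080, 'protocol': 'TCP'}],
            'command': ['/var/local/git/grpc/bins/opt/interop_server'],
            'args': ['--port=8080']
        }]
    }
}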
+ +import argparse + +import kubernetes_api + +service_name = 'stress-server' +pod_name = service_name # Use the same name for kubernetes Service and Pod +namespace = 'default' +is_headless_service = True +cmd_list=['/var/local/git/grpc/bins/opt/interop_server'] +arg_list=['--port=8080'] +port_list=[8080] +image_name='gcr.io/sree-gce/grpc_stress_test_2' +env_dict={} + +# Make sure you run kubectl proxy --port=8001 +kubernetes_api_server='localhost' +kubernetes_api_port=8001 + +is_success = kubernetes_api.create_pod( + kubernetes_api_server, + kubernetes_api_port, + namespace, + pod_name, + image_name, + port_list, + cmd_list, + arg_list, + env_dict) +if not is_success: + print("Error in creating pod") +else: + is_success = kubernetes_api.create_service( + kubernetes_api_server, + kubernetes_api_port, + namespace, + service_name, + pod_name, + port_list, # Service port list + port_list, # Container port list (same as service port list) + is_headless_service) + if not is_success: + print("Error in creating service") + else: + print("Successfully created the Server") diff --git a/tools/gke/delete_client.py b/tools/gke/delete_client.py new file mode 100755 index 0000000000..aa519f26b8 --- /dev/null +++ b/tools/gke/delete_client.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python2.7 +# Copyright 2015, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
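The server above is registered as a headless service, which is what makes the fixed name stress-server.default.svc.cluster.local resolve straight to the pod from the client containers. In Kubernetes v1 terms that amounts to a Service with clusterIP set to 'None'; a sketch of the body _make_service_config() would need to emit (the function's full text is not shown in this series, so treat the selector and field layout as assumptions):

# Sketch of a headless Kubernetes v1 Service body: clusterIP 'None' skips
# virtual-IP allocation, so cluster DNS returns the pod IP directly.
service_body = {
    'kind': 'Service',
    'apiVersion': 'v1',
    'metadata': {'name': 'stress-server'},
    'spec': {
        'clusterIP': 'None',  # the headless part
        'ports': [{'port': 8080, 'targetPort': 8080, 'protocol': 'TCP'}],
        'selector': {'name': 'stress-server'}  # assumes pods labeled name=stress-server
    }
}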
+ +import argparse + +import kubernetes_api + +argp = argparse.ArgumentParser(description='Delete Stress test clients in GKE') +argp.add_argument('-n', + '--num_instances', + required=True, + type=int, + help='The number of instances currently running') + +args = argp.parse_args() +for i in range(1, args.num_instances + 1): + service_name = 'stress-client-%d' % i + pod_name = service_name + namespace = 'default' + kubernetes_api_server="localhost" + kubernetes_api_port=8001 + + is_success=kubernetes_api.delete_pod( + kubernetes_api_server, + kubernetes_api_port, + namespace, + pod_name) + if not is_success: + print('Error in deleting Pod %s' % pod_name) + else: + is_success= kubernetes_api.delete_service( + kubernetes_api_server, + kubernetes_api_port, + namespace, + service_name) + if not is_success: + print('Error in deleting Service %s' % service_name) + else: + print('Deleted %s' % pod_name) diff --git a/tools/gke/delete_server.py b/tools/gke/delete_server.py new file mode 100755 index 0000000000..6e3fdcc33b --- /dev/null +++ b/tools/gke/delete_server.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python2.7 +# Copyright 2015, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
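The delete_pod()/delete_service() calls used in these scripts are thin wrappers over the apiserver's REST interface, reached through a local kubectl proxy --port=8001. A sketch of what a pod deletion presumably reduces to, assuming the same URL scheme as create_pod() with the resource name appended (the standard v1 REST layout; the delete URLs themselves are not shown in this diff):

# Sketch: deleting a pod through the local kubectl proxy. The URL mirrors
# the create_pod() POST URL with '/<pod_name>' appended.
import requests

del_url = 'http://localhost:8001/api/v1/namespaces/default/pods/stress-server'
r = requests.delete(del_url, timeout=10)
print 'HTTP %d' % r.status_code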
+ +import argparse + +import kubernetes_api + +service_name = 'stress-server' +pod_name = service_name # Use the same name for kubernetes Service and Pod +namespace = 'default' +is_headless_service = True +kubernetes_api_server="localhost" +kubernetes_api_port=8001 + +is_success = kubernetes_api.delete_pod( + kubernetes_api_server, + kubernetes_api_port, + namespace, + pod_name) +if not is_success: + print("Error in deleting Pod %s" % pod_name) +else: + is_success = kubernetes_api.delete_service( + kubernetes_api_server, + kubernetes_api_port, + namespace, + service_name) + if not is_success: + print("Error in deleting Service %d" % service_name) + else: + print("Deleted server %s" % service_name) diff --git a/tools/gke/kubernetes_api.py b/tools/gke/kubernetes_api.py index 7dd3015365..14d724bd31 100755 --- a/tools/gke/kubernetes_api.py +++ b/tools/gke/kubernetes_api.py @@ -33,8 +33,9 @@ import json _REQUEST_TIMEOUT_SECS = 10 + def _make_pod_config(pod_name, image_name, container_port_list, cmd_list, - arg_list): + arg_list, env_dict): """Creates a string containing the Pod defintion as required by the Kubernetes API""" body = { 'kind': 'Pod', @@ -48,20 +49,21 @@ def _make_pod_config(pod_name, image_name, container_port_list, cmd_list, { 'name': pod_name, 'image': image_name, - 'ports': [] + 'ports': [{'containerPort': port, + 'protocol': 'TCP'} for port in container_port_list] } ] } } - # Populate the 'ports' list - for port in container_port_list: - port_entry = {'containerPort': port, 'protocol': 'TCP'} - body['spec']['containers'][0]['ports'].append(port_entry) + + env_list = [{'name': k, 'value': v} for (k, v) in env_dict.iteritems()] + if len(env_list) > 0: + body['spec']['containers'][0]['env'] = env_list # Add the 'Command' and 'Args' attributes if they are passed. # Note: # - 'Command' overrides the ENTRYPOINT in the Docker Image - # - 'Args' override the COMMAND in Docker image (yes, it is confusing!) + # - 'Args' override the CMD in Docker image (yes, it is confusing!) if len(cmd_list) > 0: body['spec']['containers'][0]['command'] = cmd_list if len(arg_list) > 0: @@ -70,7 +72,7 @@ def _make_pod_config(pod_name, image_name, container_port_list, cmd_list, def _make_service_config(service_name, pod_name, service_port_list, - container_port_list, is_headless): + container_port_list, is_headless): """Creates a string containing the Service definition as required by the Kubernetes API. NOTE: @@ -124,6 +126,7 @@ def _print_connection_error(msg): print('ERROR: Connection failed. Did you remember to run Kubenetes proxy on ' 'localhost (i.e kubectl proxy --port=) ?. Error: %s' % msg) + def _do_post(post_url, api_name, request_body): """Helper to do HTTP POST. @@ -135,7 +138,9 @@ def _do_post(post_url, api_name, request_body): """ is_success = True try: - r = requests.post(post_url, data=request_body, timeout=_REQUEST_TIMEOUT_SECS) + r = requests.post(post_url, + data=request_body, + timeout=_REQUEST_TIMEOUT_SECS) if r.status_code == requests.codes.conflict: print('WARN: Looks like the resource already exists. Api: %s, url: %s' % (api_name, post_url)) @@ -143,7 +148,8 @@ def _do_post(post_url, api_name, request_body): print('ERROR: %s API returned error. 
HTTP response: (%d) %s' % (api_name, r.status_code, r.text)) is_success = False - except(requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e: + except (requests.exceptions.Timeout, + requests.exceptions.ConnectionError) as e: is_success = False _print_connection_error(str(e)) return is_success @@ -165,7 +171,8 @@ def _do_delete(del_url, api_name): print('ERROR: %s API returned error. HTTP response: %s' % (api_name, r.text)) is_success = False - except(requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e: + except (requests.exceptions.Timeout, + requests.exceptions.ConnectionError) as e: is_success = False _print_connection_error(str(e)) return is_success @@ -179,12 +186,12 @@ def create_service(kube_host, kube_port, namespace, service_name, pod_name, post_url = 'http://%s:%d/api/v1/namespaces/%s/services' % ( kube_host, kube_port, namespace) request_body = _make_service_config(service_name, pod_name, service_port_list, - container_port_list, is_headless) + container_port_list, is_headless) return _do_post(post_url, 'Create Service', request_body) def create_pod(kube_host, kube_port, namespace, pod_name, image_name, - container_port_list, cmd_list, arg_list): + container_port_list, cmd_list, arg_list, env_dict): """Creates a Kubernetes Pod. Note that it is generally NOT considered a good practice to directly create @@ -200,7 +207,7 @@ def create_pod(kube_host, kube_port, namespace, pod_name, image_name, post_url = 'http://%s:%d/api/v1/namespaces/%s/pods' % (kube_host, kube_port, namespace) request_body = _make_pod_config(pod_name, image_name, container_port_list, - cmd_list, arg_list) + cmd_list, arg_list, env_dict) return _do_post(post_url, 'Create Pod', request_body) diff --git a/tools/run_tests/stress_test_wrapper.py b/tools/run_tests/stress_test_wrapper.py new file mode 100755 index 0000000000..8f1bd2024e --- /dev/null +++ b/tools/run_tests/stress_test_wrapper.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python2.7 +import os +import re +import select +import subprocess +import sys +import time + +GRPC_ROOT = '/usr/local/google/home/sreek/workspace/grpc/' +STRESS_TEST_IMAGE = GRPC_ROOT + 'bins/opt/stress_test' +STRESS_TEST_ARGS_STR = ' '.join([ + '--server_addresses=localhost:8000', + '--test_cases=empty_unary:1,large_unary:1', '--num_stubs_per_channel=10', + '--test_duration_secs=10']) +METRICS_CLIENT_IMAGE = GRPC_ROOT + 'bins/opt/metrics_client' +METRICS_CLIENT_ARGS_STR = ' '.join([ + '--metrics_server_address=localhost:8081', '--total_only=true']) +LOGFILE_NAME = 'stress_test.log' + + +# TODO (sree): Write a python grpc client to directly query the metrics instead +# of calling metrics_client +def get_qps(metrics_cmd): + qps = 0 + try: + # Note: gpr_log() writes even non-error messages to stderr stream. 
So it is
+    # important that we set stderr=subprocess.STDOUT
+    p = subprocess.Popen(args=metrics_cmd,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.STDOUT)
+    retcode = p.wait()
+    (out_str, err_str) = p.communicate()
+    if retcode != 0:
+      print 'Error in reading metrics information'
+      print 'Output: ', out_str
+    else:
+      # The overall qps is printed at the end of the line
+      m = re.search('\d+$', out_str)
+      qps = int(m.group()) if m else 0
+  except Exception as ex:
+    print 'Exception while reading metrics information: ' + str(ex)
+  return qps
+
+
+def main(argv):
+  # TODO(sree) Create BigQuery Tables
+  # (Summary table), (Metrics table)
+
+  # TODO(sree) Update status that the test is starting (in the status table)
+  #
+
+  metrics_cmd = [METRICS_CLIENT_IMAGE] + METRICS_CLIENT_ARGS_STR.split()
+
+  stress_cmd = [STRESS_TEST_IMAGE] + STRESS_TEST_ARGS_STR.split()
+  # TODO(sree): Add an option to print to stdout if logfilename is absent
+  logfile = open(LOGFILE_NAME, 'w')
+  stress_p = subprocess.Popen(args=stress_cmd,
+                              stdout=logfile,
+                              stderr=subprocess.STDOUT)
+
+  qps_history = [1, 1, 1]  # Maintain the last 3 qps
+  qps_history_idx = 0  # Index into the qps_history list
+
+  is_error = False
+  while True:
+    # Check if the stress client is still running. If it has terminated,
+    # record the completion status and stop monitoring
+    if stress_p.poll() is not None:
+      # TODO(sree) Upload completion status to BigQuery
+      is_error = (stress_p.returncode != 0)
+      break
+
+    # Stress client still running. Get metrics
+    qps = get_qps(metrics_cmd)
+
+    # If QPS has been zero for the last 3 iterations, flag it as error and exit
+    qps_history[qps_history_idx] = qps
+    qps_history_idx = (qps_history_idx + 1) % len(qps_history)
+    if sum(qps_history) == 0:
+      print ('QPS has been zero for the last 3 iterations. Not monitoring '
+             'anymore. The stress test client may be stalled.')
+      is_error = True
+      break
+
+    # Sleep between polls so this loop does not spin
+    time.sleep(5)
+
+    # TODO(sree) Upload qps metrics to BigQuery
+
+  if is_error:
+    print 'Waiting indefinitely..'
+ select.select([],[],[]) + + return 1 + + +if __name__ == '__main__': + main(sys.argv[1:]) -- cgit v1.2.3 From 559e45becd0a50bd6af850900abbb2b5759f8719 Mon Sep 17 00:00:00 2001 From: Sree Kuchibhotla Date: Fri, 19 Feb 2016 03:02:16 -0800 Subject: Scripts to launch stress tests in GKE --- test/cpp/util/metrics_server.cc | 2 +- tools/big_query/big_query_utils.py | 140 ++++++++ tools/gke/big_query_utils.py | 181 ----------- tools/gke/create_client.py | 108 ------- tools/gke/create_server.py | 74 ----- tools/gke/delete_client.py | 66 ---- tools/gke/delete_server.py | 58 ---- tools/gke/kubernetes_api.py | 5 +- tools/gke/run_stress_tests_on_gke.py | 389 +++++++++++++++++++++++ tools/run_tests/stress_test/run_client.py | 188 +++++++++++ tools/run_tests/stress_test/run_server.py | 115 +++++++ tools/run_tests/stress_test/stress_test_utils.py | 192 +++++++++++ tools/run_tests/stress_test_wrapper.py | 96 ------ 13 files changed, 1028 insertions(+), 586 deletions(-) create mode 100755 tools/big_query/big_query_utils.py delete mode 100644 tools/gke/big_query_utils.py delete mode 100755 tools/gke/create_client.py delete mode 100755 tools/gke/create_server.py delete mode 100755 tools/gke/delete_client.py delete mode 100755 tools/gke/delete_server.py create mode 100755 tools/gke/run_stress_tests_on_gke.py create mode 100755 tools/run_tests/stress_test/run_client.py create mode 100755 tools/run_tests/stress_test/run_server.py create mode 100755 tools/run_tests/stress_test/stress_test_utils.py delete mode 100755 tools/run_tests/stress_test_wrapper.py (limited to 'tools') diff --git a/test/cpp/util/metrics_server.cc b/test/cpp/util/metrics_server.cc index 07978d0bdb..34d51eb316 100644 --- a/test/cpp/util/metrics_server.cc +++ b/test/cpp/util/metrics_server.cc @@ -57,7 +57,7 @@ long Gauge::Get() { grpc::Status MetricsServiceImpl::GetAllGauges( ServerContext* context, const EmptyMessage* request, ServerWriter* writer) { - gpr_log(GPR_INFO, "GetAllGauges called"); + gpr_log(GPR_DEBUG, "GetAllGauges called"); std::lock_guard lock(mu_); for (auto it = gauges_.begin(); it != gauges_.end(); it++) { diff --git a/tools/big_query/big_query_utils.py b/tools/big_query/big_query_utils.py new file mode 100755 index 0000000000..267d019850 --- /dev/null +++ b/tools/big_query/big_query_utils.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python2.7 +# Copyright 2015-2016 Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import json +import uuid +import httplib2 + +from apiclient import discovery +from apiclient.errors import HttpError +from oauth2client.client import GoogleCredentials + +NUM_RETRIES = 3 + + +def create_big_query(): + """Authenticates with cloud platform and gets a BiqQuery service object + """ + creds = GoogleCredentials.get_application_default() + return discovery.build('bigquery', 'v2', credentials=creds) + + +def create_dataset(biq_query, project_id, dataset_id): + is_success = True + body = { + 'datasetReference': { + 'projectId': project_id, + 'datasetId': dataset_id + } + } + + try: + dataset_req = biq_query.datasets().insert(projectId=project_id, body=body) + dataset_req.execute(num_retries=NUM_RETRIES) + except HttpError as http_error: + if http_error.resp.status == 409: + print 'Warning: The dataset %s already exists' % dataset_id + else: + # Note: For more debugging info, print "http_error.content" + print 'Error in creating dataset: %s. Err: %s' % (dataset_id, http_error) + is_success = False + return is_success + + +def create_table(big_query, project_id, dataset_id, table_id, table_schema, + description): + is_success = True + + body = { + 'description': description, + 'schema': { + 'fields': [{ + 'name': field_name, + 'type': field_type, + 'description': field_description + } for (field_name, field_type, field_description) in table_schema] + }, + 'tableReference': { + 'datasetId': dataset_id, + 'projectId': project_id, + 'tableId': table_id + } + } + + try: + table_req = big_query.tables().insert(projectId=project_id, + datasetId=dataset_id, + body=body) + res = table_req.execute(num_retries=NUM_RETRIES) + print 'Successfully created %s "%s"' % (res['kind'], res['id']) + except HttpError as http_error: + if http_error.resp.status == 409: + print 'Warning: Table %s already exists' % table_id + else: + print 'Error in creating table: %s. 
Err: %s' % (table_id, http_error) + is_success = False + return is_success + + +def insert_rows(big_query, project_id, dataset_id, table_id, rows_list): + is_success = True + body = {'rows': rows_list} + try: + insert_req = big_query.tabledata().insertAll(projectId=project_id, + datasetId=dataset_id, + tableId=table_id, + body=body) + print body + res = insert_req.execute(num_retries=NUM_RETRIES) + print res + except HttpError as http_error: + print 'Error in inserting rows in the table %s' % table_id + is_success = False + return is_success + + +def sync_query_job(big_query, project_id, query, timeout=5000): + query_data = {'query': query, 'timeoutMs': timeout} + query_job = None + try: + query_job = big_query.jobs().query( + projectId=project_id, + body=query_data).execute(num_retries=NUM_RETRIES) + except HttpError as http_error: + print 'Query execute job failed with error: %s' % http_error + print http_error.content + return query_job + + # List of (column name, column type, description) tuples +def make_row(unique_row_id, row_values_dict): + """row_values_dict is a dictionar of column name and column value. + """ + return {'insertId': unique_row_id, 'json': row_values_dict} diff --git a/tools/gke/big_query_utils.py b/tools/gke/big_query_utils.py deleted file mode 100644 index ebcf9d6ec3..0000000000 --- a/tools/gke/big_query_utils.py +++ /dev/null @@ -1,181 +0,0 @@ -import argparse -import json -import uuid -import httplib2 - -from apiclient import discovery -from apiclient.errors import HttpError -from oauth2client.client import GoogleCredentials - -NUM_RETRIES = 3 - - -def create_bq(): - """Authenticates with cloud platform and gets a BiqQuery service object - """ - creds = GoogleCredentials.get_application_default() - return discovery.build('bigquery', 'v2', credentials=creds) - - -def create_ds(biq_query, project_id, dataset_id): - is_success = True - body = { - 'datasetReference': { - 'projectId': project_id, - 'datasetId': dataset_id - } - } - try: - dataset_req = biq_query.datasets().insert(projectId=project_id, body=body) - dataset_req.execute(num_retries=NUM_RETRIES) - except HttpError as http_error: - if http_error.resp.status == 409: - print 'Warning: The dataset %s already exists' % dataset_id - else: - # Note: For more debugging info, print "http_error.content" - print 'Error in creating dataset: %s. Err: %s' % (dataset_id, http_error) - is_success = False - return is_success - - -def make_field(field_name, field_type, field_description): - return { - 'name': field_name, - 'type': field_type, - 'description': field_description - } - - -def create_table(big_query, project_id, dataset_id, table_id, fields_list, - description): - is_success = True - body = { - 'description': description, - 'schema': { - 'fields': fields_list - }, - 'tableReference': { - 'datasetId': dataset_id, - 'projectId': project_id, - 'tableId': table_id - } - } - try: - table_req = big_query.tables().insert(projectId=project_id, - datasetId=dataset_id, - body=body) - res = table_req.execute(num_retries=NUM_RETRIES) - print 'Successfully created %s "%s"' % (res['kind'], res['id']) - except HttpError as http_error: - if http_error.resp.status == 409: - print 'Warning: Table %s already exists' % table_id - else: - print 'Error in creating table: %s. 
Err: %s' % (table_id, http_error) - is_success = False - return is_success - - -def insert_rows(big_query, project_id, dataset_id, table_id, rows_list): - is_success = True - body = {'rows': rows_list} - try: - insert_req = big_query.tabledata().insertAll(projectId=project_id, - datasetId=dataset_id, - tableId=table_id, - body=body) - print body - res = insert_req.execute(num_retries=NUM_RETRIES) - print res - except HttpError as http_error: - print 'Error in inserting rows in the table %s' % table_id - is_success = False - return is_success - -##################### - - -def make_emp_row(emp_id, emp_name, emp_email): - return { - 'insertId': str(emp_id), - 'json': { - 'emp_id': emp_id, - 'emp_name': emp_name, - 'emp_email_id': emp_email - } - } - - -def get_emp_table_fields_list(): - return [ - make_field('emp_id', 'INTEGER', 'Employee id'), - make_field('emp_name', 'STRING', 'Employee name'), - make_field('emp_email_id', 'STRING', 'Employee email id') - ] - - -def insert_emp_rows(big_query, project_id, dataset_id, table_id, start_idx, - num_rows): - rows_list = [make_emp_row(i, 'sree_%d' % i, 'sreecha_%d@gmail.com' % i) - for i in range(start_idx, start_idx + num_rows)] - insert_rows(big_query, project_id, dataset_id, table_id, rows_list) - - -def create_emp_table(big_query, project_id, dataset_id, table_id): - fields_list = get_emp_table_fields_list() - description = 'Test table created by sree' - create_table(big_query, project_id, dataset_id, table_id, fields_list, - description) - - -def sync_query(big_query, project_id, query, timeout=5000): - query_data = {'query': query, 'timeoutMs': timeout} - query_job = None - try: - query_job = big_query.jobs().query( - projectId=project_id, - body=query_data).execute(num_retries=NUM_RETRIES) - except HttpError as http_error: - print 'Query execute job failed with error: %s' % http_error - print http_error.content - return query_job - -#[Start query_emp_records] -def query_emp_records(big_query, project_id, dataset_id, table_id): - query = 'SELECT emp_id, emp_name FROM %s.%s ORDER BY emp_id;' % (dataset_id, table_id) - print query - query_job = sync_query(big_query, project_id, query, 5000) - job_id = query_job['jobReference'] - - print query_job - print '**Starting paging **' - #[Start Paging] - page_token = None - while True: - page = big_query.jobs().getQueryResults( - pageToken=page_token, - **query_job['jobReference']).execute(num_retries=NUM_RETRIES) - rows = page['rows'] - for row in rows: - print row['f'][0]['v'], "---", row['f'][1]['v'] - page_token = page.get('pageToken') - if not page_token: - break - #[End Paging] -#[End query_emp_records] - -######################### -DATASET_SEQ_NUM = 1 -TABLE_SEQ_NUM = 11 - -PROJECT_ID = 'sree-gce' -DATASET_ID = 'sree_test_dataset_%d' % DATASET_SEQ_NUM -TABLE_ID = 'sree_test_table_%d' % TABLE_SEQ_NUM - -EMP_ROW_IDX = 10 -EMP_NUM_ROWS = 5 - -bq = create_bq() -create_ds(bq, PROJECT_ID, DATASET_ID) -create_emp_table(bq, PROJECT_ID, DATASET_ID, TABLE_ID) -insert_emp_rows(bq, PROJECT_ID, DATASET_ID, TABLE_ID, EMP_ROW_IDX, EMP_NUM_ROWS) -query_emp_records(bq, PROJECT_ID, DATASET_ID, TABLE_ID) diff --git a/tools/gke/create_client.py b/tools/gke/create_client.py deleted file mode 100755 index bc56ef0ef1..0000000000 --- a/tools/gke/create_client.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env python2.7 -# Copyright 2015, Google Inc. -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import argparse - -import kubernetes_api - -argp = argparse.ArgumentParser(description='Launch Stress tests in GKE') - -argp.add_argument('-n', - '--num_instances', - required=True, - type=int, - help='The number of instances to launch in GKE') -args = argp.parse_args() - -kubernetes_api_server="localhost" -kubernetes_api_port=8001 - - -# Docker image -image_name="gcr.io/sree-gce/grpc_stress_test_2" - -server_address = "stress-server.default.svc.cluster.local:8080" -metrics_server_address = "localhost:8081" - -stress_test_arg_list=[ - "--server_addresses=" + server_address, - "--test_cases=empty_unary:20,large_unary:20", - "--num_stubs_per_channel=10" -] - -metrics_client_arg_list=[ - "--metrics_server_address=" + metrics_server_address, - "--total_only=true"] - -env_dict={ - "GPRC_ROOT": "/var/local/git/grpc", - "STRESS_TEST_IMAGE": "/var/local/git/grpc/bins/opt/stress_test", - "STRESS_TEST_ARGS_STR": ' '.join(stress_test_arg_list), - "METRICS_CLIENT_IMAGE": "/var/local/git/grpc/bins/opt/metrics_client", - "METRICS_CLIENT_ARGS_STR": ' '.join(metrics_client_arg_list)} - -cmd_list=["/var/local/git/grpc/bins/opt/stress_test"] -arg_list=stress_test_arg_list # make this [] in future -port_list=[8081] - -namespace = 'default' -is_headless_service = False # Client is NOT headless service - -print('Creating %d instances of client..' 
% args.num_instances) - -for i in range(1, args.num_instances + 1): - service_name = 'stress-client-%d' % i - pod_name = service_name # Use the same name for kubernetes Service and Pod - is_success = kubernetes_api.create_pod( - kubernetes_api_server, - kubernetes_api_port, - namespace, - pod_name, - image_name, - port_list, - cmd_list, - arg_list, - env_dict) - if not is_success: - print("Error in creating pod %s" % pod_name) - else: - is_success = kubernetes_api.create_service( - kubernetes_api_server, - kubernetes_api_port, - namespace, - service_name, - pod_name, - port_list, # Service port list - port_list, # Container port list (same as service port list) - is_headless_service) - if not is_success: - print("Error in creating service %s" % service_name) - else: - print("Created client %s" % pod_name) diff --git a/tools/gke/create_server.py b/tools/gke/create_server.py deleted file mode 100755 index 23ab62c205..0000000000 --- a/tools/gke/create_server.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python2.7 -# Copyright 2015, Google Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import argparse - -import kubernetes_api - -service_name = 'stress-server' -pod_name = service_name # Use the same name for kubernetes Service and Pod -namespace = 'default' -is_headless_service = True -cmd_list=['/var/local/git/grpc/bins/opt/interop_server'] -arg_list=['--port=8080'] -port_list=[8080] -image_name='gcr.io/sree-gce/grpc_stress_test_2' -env_dict={} - -# Make sure you run kubectl proxy --port=8001 -kubernetes_api_server='localhost' -kubernetes_api_port=8001 - -is_success = kubernetes_api.create_pod( - kubernetes_api_server, - kubernetes_api_port, - namespace, - pod_name, - image_name, - port_list, - cmd_list, - arg_list, - env_dict) -if not is_success: - print("Error in creating pod") -else: - is_success = kubernetes_api.create_service( - kubernetes_api_server, - kubernetes_api_port, - namespace, - service_name, - pod_name, - port_list, # Service port list - port_list, # Container port list (same as service port list) - is_headless_service) - if not is_success: - print("Error in creating service") - else: - print("Successfully created the Server") diff --git a/tools/gke/delete_client.py b/tools/gke/delete_client.py deleted file mode 100755 index aa519f26b8..0000000000 --- a/tools/gke/delete_client.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python2.7 -# Copyright 2015, Google Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
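# Editor's note (illustrative, not part of the patch): because the server above
# is created with is_headless_service = True, GKE's cluster DNS publishes a
# record for the pod, which is how clients later reach it at
# 'stress-server.default.svc.cluster.local:8080' (the
# <service>.<namespace>.svc.<cluster-domain> convention). From inside the
# cluster the name resolves like any other host:
import socket

addr_info = socket.getaddrinfo('stress-server.default.svc.cluster.local',
                               8080, 0, socket.SOCK_STREAM)
print addr_info  # only resolvable from a pod inside the cluster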
- -import argparse - -import kubernetes_api - -argp = argparse.ArgumentParser(description='Delete Stress test clients in GKE') -argp.add_argument('-n', - '--num_instances', - required=True, - type=int, - help='The number of instances currently running') - -args = argp.parse_args() -for i in range(1, args.num_instances + 1): - service_name = 'stress-client-%d' % i - pod_name = service_name - namespace = 'default' - kubernetes_api_server="localhost" - kubernetes_api_port=8001 - - is_success=kubernetes_api.delete_pod( - kubernetes_api_server, - kubernetes_api_port, - namespace, - pod_name) - if not is_success: - print('Error in deleting Pod %s' % pod_name) - else: - is_success= kubernetes_api.delete_service( - kubernetes_api_server, - kubernetes_api_port, - namespace, - service_name) - if not is_success: - print('Error in deleting Service %s' % service_name) - else: - print('Deleted %s' % pod_name) diff --git a/tools/gke/delete_server.py b/tools/gke/delete_server.py deleted file mode 100755 index 6e3fdcc33b..0000000000 --- a/tools/gke/delete_server.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python2.7 -# Copyright 2015, Google Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
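# Editor's sketch (not part of the patch): the delete_pod/delete_service calls
# in the cleanup loop above map to plain REST DELETEs through the same local
# proxy. Illustrative only; assumes 'kubectl proxy --port=8001' and the
# 'requests' package.
import requests

def delete_pod_sketch(pod_name, namespace='default'):
  url = 'http://localhost:8001/api/v1/namespaces/%s/pods/%s' % (namespace,
                                                                pod_name)
  return requests.delete(url).status_code == 200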
- -import argparse - -import kubernetes_api - -service_name = 'stress-server' -pod_name = service_name # Use the same name for kubernetes Service and Pod -namespace = 'default' -is_headless_service = True -kubernetes_api_server="localhost" -kubernetes_api_port=8001 - -is_success = kubernetes_api.delete_pod( - kubernetes_api_server, - kubernetes_api_port, - namespace, - pod_name) -if not is_success: - print("Error in deleting Pod %s" % pod_name) -else: - is_success = kubernetes_api.delete_service( - kubernetes_api_server, - kubernetes_api_port, - namespace, - service_name) - if not is_success: - print("Error in deleting Service %d" % service_name) - else: - print("Deleted server %s" % service_name) diff --git a/tools/gke/kubernetes_api.py b/tools/gke/kubernetes_api.py index 14d724bd31..d14c26ad6a 100755 --- a/tools/gke/kubernetes_api.py +++ b/tools/gke/kubernetes_api.py @@ -1,5 +1,5 @@ #!/usr/bin/env python2.7 -# Copyright 2015, Google Inc. +# Copyright 2015-2016 Google Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -50,7 +50,8 @@ def _make_pod_config(pod_name, image_name, container_port_list, cmd_list, 'name': pod_name, 'image': image_name, 'ports': [{'containerPort': port, - 'protocol': 'TCP'} for port in container_port_list] + 'protocol': 'TCP'} for port in container_port_list], + 'imagePullPolicy': 'Always' } ] } diff --git a/tools/gke/run_stress_tests_on_gke.py b/tools/gke/run_stress_tests_on_gke.py new file mode 100755 index 0000000000..d0c3887a42 --- /dev/null +++ b/tools/gke/run_stress_tests_on_gke.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python2.7 +# Copyright 2015-2016, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
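# Editor's note on the kubernetes_api.py hunk above: 'imagePullPolicy: Always'
# forces the node to re-pull the image every time a pod starts. That matters
# here because the stress-test image is re-pushed under the same tag for each
# run, and the default policy could silently reuse a stale cached image. The
# resulting container spec looks roughly like this (values are illustrative):
container_spec = {
    'name': 'stress-client-1',                    # example pod name
    'image': 'gcr.io/sree-gce/grpc_stress_test_2',
    'ports': [{'containerPort': 8081, 'protocol': 'TCP'}],
    'imagePullPolicy': 'Always'
}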
+import datetime +import os +import subprocess +import sys +import time + +import kubernetes_api + +GRPC_ROOT = os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), '../..')) +os.chdir(GRPC_ROOT) + +class BigQuerySettings: + + def __init__(self, run_id, dataset_id, summary_table_id, qps_table_id): + self.run_id = run_id + self.dataset_id = dataset_id + self.summary_table_id = summary_table_id + self.qps_table_id = qps_table_id + + +class KubernetesProxy: + """ Class to start a proxy on localhost to the Kubernetes API server """ + + def __init__(self, api_port): + self.port = api_port + self.p = None + self.started = False + + def start(self): + cmd = ['kubectl', 'proxy', '--port=%d' % self.port] + self.p = subprocess.Popen(args=cmd) + self.started = True + time.sleep(2) + print '..Started' + + def get_port(self): + return self.port + + def is_started(self): + return self.started + + def __del__(self): + if self.p is not None: + self.p.kill() + + +def _build_docker_image(image_name, tag_name): + """ Build the docker image and add a tag """ + os.environ['INTEROP_IMAGE'] = image_name + # Note that 'BASE_NAME' HAS to be 'grpc_interop_stress_cxx' since the script + # build_interop_stress_image.sh invokes the following script: + # tools/dockerfile/$BASE_NAME/build_interop_stress.sh + os.environ['BASE_NAME'] = 'grpc_interop_stress_cxx' + cmd = ['tools/jenkins/build_interop_stress_image.sh'] + p = subprocess.Popen(args=cmd) + retcode = p.wait() + if retcode != 0: + print 'Error in building docker image' + return False + + cmd = ['docker', 'tag', '-f', image_name, tag_name] + p = subprocess.Popen(args=cmd) + retcode = p.wait() + if retcode != 0: + print 'Error in creating the tag %s for %s' % (tag_name, image_name) + return False + + return True + + +def _push_docker_image_to_gke_registry(docker_tag_name): + """Executes 'gcloud docker push ' to push the image to GKE registry""" + cmd = ['gcloud', 'docker', 'push', docker_tag_name] + print 'Pushing %s to GKE registry..' 
% docker_tag_name + p = subprocess.Popen(args=cmd) + retcode = p.wait() + if retcode != 0: + print 'Error in pushing docker image %s to the GKE registry' % docker_tag_name + return False + return True + + +def _launch_image_on_gke(kubernetes_api_server, kubernetes_api_port, namespace, + pod_name, image_name, port_list, cmd_list, arg_list, + env_dict, is_headless_service): + """Creates a GKE Pod and a Service object for a given image by calling Kubernetes API""" + is_success = kubernetes_api.create_pod( + kubernetes_api_server, + kubernetes_api_port, + namespace, + pod_name, + image_name, + port_list, # The ports to be exposed on this container/pod + cmd_list, # The command that launches the stress server + arg_list, + env_dict # Environment variables to be passed to the pod + ) + if not is_success: + print 'Error in creating Pod' + return False + + is_success = kubernetes_api.create_service( + kubernetes_api_server, + kubernetes_api_port, + namespace, + pod_name, # Use the pod name for service name as well + pod_name, + port_list, # Service port list + port_list, # Container port list (same as service port list) + is_headless_service) + if not is_success: + print 'Error in creating Service' + return False + + print 'Successfully created the pod/service %s' % pod_name + return True + + +def _delete_image_on_gke(kubernetes_proxy, pod_name_list): + """Deletes a GKE Pod and Service object for given list of Pods by calling Kubernetes API""" + if not kubernetes_proxy.is_started: + print 'Kubernetes proxy must be started before calling this function' + return False + + is_success = True + for pod_name in pod_name_list: + is_success = kubernetes_api.delete_pod( + 'localhost', kubernetes_proxy.get_port(), 'default', pod_name) + if not is_success: + print 'Error in deleting pod %s' % pod_name + break + + is_success = kubernetes_api.delete_service( + 'localhost', kubernetes_proxy.get_port(), 'default', + pod_name) # service name same as pod name + if not is_success: + print 'Error in deleting service %s' % pod_name + break + + if is_success: + print 'Successfully deleted the Pods/Services: %s' % ','.join(pod_name_list) + + return is_success + + +def _launch_server(gcp_project_id, docker_image_name, bq_settings, + kubernetes_proxy, server_pod_name, server_port): + """ Launches a stress test server instance in GKE cluster """ + if not kubernetes_proxy.is_started: + print 'Kubernetes proxy must be started before calling this function' + return False + + server_cmd_list = [ + '/var/local/git/grpc/tools/run_tests/stress_test/run_server.py' + ] # Process that is launched + server_arg_list = [] # run_server.py does not take any args (for now) + + # == Parameters to the server process launched in GKE == + server_env = { + 'STRESS_TEST_IMAGE_TYPE': 'SERVER', + 'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/interop_server', + 'STRESS_TEST_ARGS_STR': '--port=%s' % server_port, + 'RUN_ID': bq_settings.run_id, + 'POD_NAME': server_pod_name, + 'GCP_PROJECT_ID': gcp_project_id, + 'DATASET_ID': bq_settings.dataset_id, + 'SUMMARY_TABLE_ID': bq_settings.summary_table_id, + 'QPS_TABLE_ID': bq_settings.qps_table_id + } + + # Launch Server + is_success = _launch_image_on_gke( + 'localhost', + kubernetes_proxy.get_port(), + 'default', + server_pod_name, + docker_image_name, + [server_port], # Port that should be exposed on the container + server_cmd_list, + server_arg_list, + server_env, + True # Headless = True for server. 
Since we want DNS records to be created by GKE
+  )
+
+  return is_success
+
+
+def _launch_client(gcp_project_id, docker_image_name, bq_settings,
+                   kubernetes_proxy, num_instances, client_pod_name_prefix,
+                   server_pod_name, server_port):
+  """ Launches a configurable number of stress test clients on GKE cluster """
+  if not kubernetes_proxy.is_started:
+    print 'Kubernetes proxy must be started before calling this function'
+    return False
+
+  server_address = '%s.default.svc.cluster.local:%d' % (server_pod_name,
+                                                        server_port)
+  #TODO(sree) Make the whole client args configurable
+  test_cases_str = 'empty_unary:1,large_unary:1'
+  stress_client_arg_list = [
+      '--server_addresses=%s' % server_address,
+      '--test_cases=%s' % test_cases_str, '--num_stubs_per_channel=10'
+  ]
+
+  client_cmd_list = [
+      '/var/local/git/grpc/tools/run_tests/stress_test/run_client.py'
+  ]
+  # run_client.py takes no args. All args are passed as env variables
+  client_arg_list = []
+
+  # TODO(sree) Make this configurable (and also less frequent)
+  poll_interval_secs = 5
+
+  metrics_port = 8081
+  metrics_server_address = 'localhost:%d' % metrics_port
+  metrics_client_arg_list = [
+      '--metrics_server_address=%s' % metrics_server_address,
+      '--total_only=true'
+  ]
+
+  client_env = {
+      'STRESS_TEST_IMAGE_TYPE': 'CLIENT',
+      'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/stress_test',
+      'STRESS_TEST_ARGS_STR': ' '.join(stress_client_arg_list),
+      'METRICS_CLIENT_IMAGE': '/var/local/git/grpc/bins/opt/metrics_client',
+      'METRICS_CLIENT_ARGS_STR': ' '.join(metrics_client_arg_list),
+      'RUN_ID': bq_settings.run_id,
+      'POLL_INTERVAL_SECS': str(poll_interval_secs),
+      'GCP_PROJECT_ID': gcp_project_id,
+      'DATASET_ID': bq_settings.dataset_id,
+      'SUMMARY_TABLE_ID': bq_settings.summary_table_id,
+      'QPS_TABLE_ID': bq_settings.qps_table_id
+  }
+
+  for i in range(1, num_instances + 1):
+    pod_name = '%s-%d' % (client_pod_name_prefix, i)
+    client_env['POD_NAME'] = pod_name
+    is_success = _launch_image_on_gke(
+        'localhost',
+        kubernetes_proxy.get_port(),
+        'default',
+        pod_name,
+        docker_image_name,
+        [metrics_port],  # Client pods expose metrics port
+        client_cmd_list,
+        client_arg_list,
+        client_env,
+        False  # Client is not a headless service.
+    )
+    if not is_success:
+      print 'Error in launching client %s' % pod_name
+      return False
+
+  return True
+
+
+def _launch_server_and_client(gcp_project_id, docker_image_name,
+                              num_client_instances):
+  # == Big Query tables related settings (Common for both server and client) ==
+
+  # Create a unique id for this run (Note: Using timestamp instead of UUID to
+  # make it easier to deduce the date/time of the run just by looking at the
+  # run id. This is useful in debugging when looking at records in BigQuery)
+  run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
+
+  dataset_id = 'stress_test_%s' % run_id
+  summary_table_id = 'summary'
+  qps_table_id = 'qps'
+
+  bq_settings = BigQuerySettings(run_id, dataset_id, summary_table_id,
+                                 qps_table_id)
+
+  # Start kubernetes proxy
+  kubernetes_api_port = 9001
+  kubernetes_proxy = KubernetesProxy(kubernetes_api_port)
+  kubernetes_proxy.start()
+
+  server_pod_name = 'stress-server'
+  server_port = 8080
+  is_success = _launch_server(gcp_project_id, docker_image_name, bq_settings,
+                              kubernetes_proxy, server_pod_name, server_port)
+  if not is_success:
+    print 'Error in launching server'
+    return False
+
+  # Server takes a while to start.
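# Editor's sketch (not part of the patch): the fixed sleep below is the stopgap
# the TODO refers to; a readiness check could instead poll the pod's status
# through the same proxy until it reports 'Running'. Assumes the 'requests'
# package and the kubectl proxy on localhost:9001 used above.
import time
import requests

def wait_for_pod_running(pod_name, proxy_port=9001, timeout_secs=120):
  url = 'http://localhost:%d/api/v1/namespaces/default/pods/%s' % (proxy_port,
                                                                   pod_name)
  deadline = time.time() + timeout_secs
  while time.time() < deadline:
    resp = requests.get(url)
    # status.phase is one of Pending/Running/Succeeded/Failed/Unknown
    if resp.status_code == 200 and \
       resp.json().get('status', {}).get('phase') == 'Running':
      return True
    time.sleep(5)
  return False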
+ # TODO(sree) Use Kubernetes API to query the status of the server instead of + # sleeping + time.sleep(60) + + # Launch client + server_address = '%s.default.svc.cluster.local:%d' % (server_pod_name, + server_port) + client_pod_name_prefix = 'stress-client' + is_success = _launch_client(gcp_project_id, docker_image_name, bq_settings, + kubernetes_proxy, num_client_instances, + client_pod_name_prefix, server_pod_name, + server_port) + if not is_success: + print 'Error in launching client(s)' + return False + + return True + + +def _delete_server_and_client(num_client_instances): + kubernetes_api_port = 9001 + kubernetes_proxy = KubernetesProxy(kubernetes_api_port) + kubernetes_proxy.start() + + # Delete clients first + client_pod_names = ['stress-client-%d' % i + for i in range(1, num_client_instances + 1)] + + is_success = _delete_image_on_gke(kubernetes_proxy, client_pod_names) + if not is_success: + return False + + # Delete server + server_pod_name = 'stress-server' + return _delete_image_on_gke(kubernetes_proxy, [server_pod_name]) + + +def _build_and_push_docker_image(gcp_project_id, docker_image_name, tag_name): + is_success = _build_docker_image(docker_image_name, tag_name) + if not is_success: + return False + return _push_docker_image_to_gke_registry(tag_name) + + +# TODO(sree): This is just to test the above APIs. Rewrite this to make +# everything configurable (like image names / number of instances etc) +def test_run(): + image_name = 'grpc_stress_test' + gcp_project_id = 'sree-gce' + tag_name = 'gcr.io/%s/%s' % (gcp_project_id, image_name) + num_client_instances = 3 + + is_success = _build_docker_image(image_name, tag_name) + if not is_success: + return + + is_success = _push_docker_image_to_gke_registry(tag_name) + if not is_success: + return + + is_success = _launch_server_and_client(gcp_project_id, tag_name, + num_client_instances) + + # Run the test for 2 mins + time.sleep(120) + + is_success = _delete_server_and_client(num_client_instances) + + if not is_success: + return + + +if __name__ == '__main__': + test_run() diff --git a/tools/run_tests/stress_test/run_client.py b/tools/run_tests/stress_test/run_client.py new file mode 100755 index 0000000000..33958bce49 --- /dev/null +++ b/tools/run_tests/stress_test/run_client.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python2.7 +# Copyright 2015-2016, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import datetime
+import os
+import re
+import select
+import subprocess
+import sys
+import time
+
+from stress_test_utils import EventType
+from stress_test_utils import BigQueryHelper
+
+
+# TODO (sree): Write a python grpc client to directly query the metrics instead
+# of calling metrics_client
+def _get_qps(metrics_cmd):
+  qps = 0
+  try:
+    # Note: gpr_log() writes even non-error messages to stderr stream. So it is
+    # important that we set stderr=subprocess.STDOUT
+    p = subprocess.Popen(args=metrics_cmd,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.STDOUT)
+    retcode = p.wait()
+    (out_str, err_str) = p.communicate()
+    if retcode != 0:
+      print 'Error in reading metrics information'
+      print 'Output: ', out_str
+    else:
+      # The overall qps is printed at the end of the line
+      m = re.search('\d+$', out_str)
+      qps = int(m.group()) if m else 0
+  except Exception as ex:
+    print 'Exception while reading metrics information: ' + str(ex)
+  return qps
+
+
+def run_client():
+  """This is a wrapper around the stress test client and performs the following:
+      1) Create the following two tables in Big Query:
+         (i) Summary table: To record events like the test started, completed
+             successfully or failed
+         (ii) Qps table: To periodically record the QPS sent by this client
+      2) Start the stress test client and add a row in the Big Query summary
+         table
+      3) Once every few seconds (as specified by the poll_interval_secs) poll
+         the status of the stress test client process and perform the
+         following:
+          3.1) If the process is still running, get the current qps by invoking
+               the metrics client program and add a row in the Big Query
+               Qps table. Sleep for a duration specified by poll_interval_secs
+          3.2) If the process exited successfully, add a row in the Big Query
+               Summary table and exit
+          3.3) If the process failed, add a row in Big Query summary table and
+               wait forever.
+               NOTE: This script typically runs inside a GKE pod which means
+               that the pod gets destroyed when the script exits. However, in
+               case the stress test client fails, we would not want the pod to
+               be destroyed (since we might want to connect to the pod for
+               examining logs).
This is the reason why the script waits forever + in case of failures + """ + env = dict(os.environ) + image_type = env['STRESS_TEST_IMAGE_TYPE'] + image_name = env['STRESS_TEST_IMAGE'] + args_str = env['STRESS_TEST_ARGS_STR'] + metrics_client_image = env['METRICS_CLIENT_IMAGE'] + metrics_client_args_str = env['METRICS_CLIENT_ARGS_STR'] + run_id = env['RUN_ID'] + pod_name = env['POD_NAME'] + logfile_name = env.get('LOGFILE_NAME') + poll_interval_secs = float(env['POLL_INTERVAL_SECS']) + project_id = env['GCP_PROJECT_ID'] + dataset_id = env['DATASET_ID'] + summary_table_id = env['SUMMARY_TABLE_ID'] + qps_table_id = env['QPS_TABLE_ID'] + + bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id, + dataset_id, summary_table_id, qps_table_id) + bq_helper.initialize() + + # Create BigQuery Dataset and Tables: Summary Table and Metrics Table + if not bq_helper.setup_tables(): + print 'Error in creating BigQuery tables' + return + + start_time = datetime.datetime.now() + + logfile = None + details = 'Logging to stdout' + if logfile_name is not None: + print 'Opening logfile: %s ...' % logfile_name + details = 'Logfile: %s' % logfile_name + logfile = open(logfile_name, 'w') + + # Update status that the test is starting (in the status table) + bq_helper.insert_summary_row(EventType.STARTING, details) + + metrics_cmd = [metrics_client_image + ] + [x for x in metrics_client_args_str.split()] + stress_cmd = [image_name] + [x for x in args_str.split()] + + print 'Launching process %s ...' % stress_cmd + stress_p = subprocess.Popen(args=stress_cmd, + stdout=logfile, + stderr=subprocess.STDOUT) + + qps_history = [1, 1, 1] # Maintain the last 3 qps readings + qps_history_idx = 0 # Index into the qps_history list + + is_error = False + while True: + # Check if stress_client is still running. If so, collect metrics and upload + # to BigQuery status table + if stress_p.poll() is not None: + # TODO(sree) Upload completion status to BigQuery + end_time = datetime.datetime.now().isoformat() + event_type = EventType.SUCCESS + details = 'End time: %s' % end_time + if stress_p.returncode != 0: + event_type = EventType.FAILURE + details = 'Return code = %d. End time: %s' % (stress_p.returncode, + end_time) + is_error = True + bq_helper.insert_summary_row(event_type, details) + print details + break + + # Stress client still running. Get metrics + qps = _get_qps(metrics_cmd) + qps_recorded_at = datetime.datetime.now().isoformat() + print 'qps: %d at %s' % (qps, qps_recorded_at) + + # If QPS has been zero for the last 3 iterations, flag it as error and exit + qps_history[qps_history_idx] = qps + qps_history_idx = (qps_history_idx + 1) % len(qps_history) + if sum(qps_history) == 0: + details = 'QPS has been zero for the last %d seconds - as of : %s' % ( + poll_interval_secs * 3, qps_recorded_at) + is_error = True + bq_helper.insert_summary_row(EventType.FAILURE, details) + print details + break + + # Upload qps metrics to BiqQuery + bq_helper.insert_qps_row(qps, qps_recorded_at) + + time.sleep(poll_interval_secs) + + if is_error: + print 'Waiting indefinitely..' + select.select([], [], []) + + print 'Completed' + return + + +if __name__ == '__main__': + run_client() diff --git a/tools/run_tests/stress_test/run_server.py b/tools/run_tests/stress_test/run_server.py new file mode 100755 index 0000000000..9ad8d63638 --- /dev/null +++ b/tools/run_tests/stress_test/run_server.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python2.7 +# Copyright 2015-2016, Google Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import datetime +import os +import select +import subprocess +import sys +import time + +from stress_test_utils import BigQueryHelper +from stress_test_utils import EventType + + +def run_server(): + """This is a wrapper around the interop server and performs the following: + 1) Create a 'Summary table' in Big Query to record events like the server + started, completed successfully or failed. NOTE: This also creates + another table called the QPS table which is currently NOT needed on the + server (it is needed on the stress test clients) + 2) Start the server process and add a row in Big Query summary table + 3) Wait for the server process to terminate. The server process does not + terminate unless there is an error. + If the server process terminated with a failure, add a row in Big Query + and wait forever. + NOTE: This script typically runs inside a GKE pod which means that the + pod gets destroyed when the script exits. However, in case the server + process fails, we would not want the pod to be destroyed (since we + might want to connect to the pod for examining logs). This is the + reason why the script waits forever in case of failures. 
+ """ + + # Read the parameters from environment variables + env = dict(os.environ) + + run_id = env['RUN_ID'] # The unique run id for this test + image_type = env['STRESS_TEST_IMAGE_TYPE'] + image_name = env['STRESS_TEST_IMAGE'] + args_str = env['STRESS_TEST_ARGS_STR'] + pod_name = env['POD_NAME'] + project_id = env['GCP_PROJECT_ID'] + dataset_id = env['DATASET_ID'] + summary_table_id = env['SUMMARY_TABLE_ID'] + qps_table_id = env['QPS_TABLE_ID'] + + logfile_name = env.get('LOGFILE_NAME') + + bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id, + dataset_id, summary_table_id, qps_table_id) + bq_helper.initialize() + + # Create BigQuery Dataset and Tables: Summary Table and Metrics Table + if not bq_helper.setup_tables(): + print 'Error in creating BigQuery tables' + return + + start_time = datetime.datetime.now() + + logfile = None + details = 'Logging to stdout' + if logfile_name is not None: + print 'Opening log file: ', logfile_name + logfile = open(logfile_name, 'w') + details = 'Logfile: %s' % logfile_name + + # Update status that the test is starting (in the status table) + bq_helper.insert_summary_row(EventType.STARTING, details) + + stress_cmd = [image_name] + [x for x in args_str.split()] + + print 'Launching process %s ...' % stress_cmd + stress_p = subprocess.Popen(args=stress_cmd, + stdout=logfile, + stderr=subprocess.STDOUT) + + returncode = stress_p.wait() + if returncode != 0: + end_time = datetime.datetime.now().isoformat() + event_type = EventType.FAILURE + details = 'Returncode: %d; End time: %s' % (returncode, end_time) + bq_helper.insert_summary_row(event_type, details) + print 'Waiting indefinitely..' + select.select([], [], []) + return returncode + + +if __name__ == '__main__': + run_server() diff --git a/tools/run_tests/stress_test/stress_test_utils.py b/tools/run_tests/stress_test/stress_test_utils.py new file mode 100755 index 0000000000..a0626ce3ac --- /dev/null +++ b/tools/run_tests/stress_test/stress_test_utils.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python2.7 +# Copyright 2015-2016, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import datetime +import json +import os +import re +import select +import subprocess +import sys +import time + +# Import big_query_utils module +bq_utils_dir = os.path.abspath(os.path.join( + os.path.dirname(__file__), '../../big_query')) +sys.path.append(bq_utils_dir) +import big_query_utils as bq_utils + +class EventType: + STARTING = 'STARTING' + SUCCESS = 'SUCCESS' + FAILURE = 'FAILURE' + +class BigQueryHelper: + """Helper class for the stress test wrappers to interact with BigQuery. + """ + + def __init__(self, run_id, image_type, pod_name, project_id, dataset_id, + summary_table_id, qps_table_id): + self.run_id = run_id + self.image_type = image_type + self.pod_name = pod_name + self.project_id = project_id + self.dataset_id = dataset_id + self.summary_table_id = summary_table_id + self.qps_table_id = qps_table_id + + def initialize(self): + self.bq = bq_utils.create_big_query() + + def setup_tables(self): + return bq_utils.create_dataset(self.bq, self.project_id, self.dataset_id) \ + and self.__create_summary_table() \ + and self.__create_qps_table() + + def insert_summary_row(self, event_type, details): + row_values_dict = { + 'run_id': self.run_id, + 'image_type': self.image_type, + 'pod_name': self.pod_name, + 'event_date': datetime.datetime.now().isoformat(), + 'event_type': event_type, + 'details': details + } + # Something that uniquely identifies the row (Biquery needs it for duplicate + # detection). 
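# Editor's note: concretely, a STARTING event for pod 'stress-client-1' in run
# '2016_02_24_17_31_52' gets the insertId
# '2016_02_24_17_31_52_stress-client-1_STARTING', so if the upload is retried,
# BigQuery drops the duplicate instead of double-counting the event.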
+ row_unique_id = '%s_%s_%s' % (self.run_id, self.pod_name, event_type) + + row = bq_utils.make_row(row_unique_id, row_values_dict) + return bq_utils.insert_rows(self.bq, self.project_id, self.dataset_id, + self.summary_table_id, [row]) + + def insert_qps_row(self, qps, recorded_at): + row_values_dict = { + 'run_id': self.run_id, + 'pod_name': self.pod_name, + 'recorded_at': recorded_at, + 'qps': qps + } + + row_unique_id = '%s_%s_%s' % (self.run_id, self.pod_name, recorded_at) + row = bq_utils.make_row(row_unique_id, row_values_dict) + return bq_utils.insert_rows(self.bq, self.project_id, self.dataset_id, + self.qps_table_id, [row]) + + def check_if_any_tests_failed(self, num_query_retries=3): + query = ('SELECT event_type FROM %s.%s WHERE run_id = %s AND ' + 'event_type="%s"') % (self.dataset_id, self.summary_table_id, + self.run_id, EventType.FAILURE) + query_job = bq_utils.sync_query_job(self.bq, self.project_id, query) + page = self.bq.jobs().getQueryResults(**query_job['jobReference']).execute( + num_retries=num_query_retries) + print page + num_failures = int(page['totalRows']) + print 'num rows: ', num_failures + return num_failures > 0 + + def print_summary_records(self, num_query_retries=3): + line = '-' * 120 + print line + print 'Summary records' + print 'Run Id', self.run_id + print line + query = ('SELECT pod_name, image_type, event_type, event_date, details' + ' FROM %s.%s WHERE run_id = %s ORDER by event_date;') % ( + self.dataset_id, self.summary_table_id, self.run_id) + query_job = bq_utils.sync_query_job(self.bq, self.project_id, query) + + print '{:<25} {:<12} {:<12} {:<30} {}'.format( + 'Pod name', 'Image type', 'Event type', 'Date', 'Details') + print line + page_token = None + while True: + page = self.bq.jobs().getQueryResults( + pageToken=page_token, + **query_job['jobReference']).execute(num_retries=num_query_retries) + rows = page.get('rows', []) + for row in rows: + print '{:<25} {:<12} {:<12} {:<30} {}'.format( + row['f'][0]['v'], row['f'][1]['v'], row['f'][2]['v'], + row['f'][3]['v'], row['f'][4]['v']) + page_token = page.get('pageToken') + if not page_token: + break + + def print_qps_records(self, num_query_retries=3): + line = '-' * 80 + print line + print 'QPS Summary' + print 'Run Id: ', self.run_id + print line + query = ( + 'SELECT pod_name, recorded_at, qps FROM %s.%s WHERE run_id = %s ORDER ' + 'by recorded_at;') % (self.dataset_id, self.qps_table_id, self.run_id) + query_job = bq_utils.sync_query_job(self.bq, self.project_id, query) + print '{:<25} {:30} {}'.format('Pod name', 'Recorded at', 'Qps') + print line + page_token = None + while True: + page = self.bq.jobs().getQueryResults( + pageToken=page_token, + **query_job['jobReference']).execute(num_retries=num_query_retries) + rows = page.get('rows', []) + for row in rows: + print '{:<25} {:30} {}'.format(row['f'][0]['v'], row['f'][1]['v'], + row['f'][2]['v']) + page_token = page.get('pageToken') + if not page_token: + break + + def __create_summary_table(self): + summary_table_schema = [ + ('run_id', 'INTEGER', 'Test run id'), + ('image_type', 'STRING', 'Client or Server?'), + ('pod_name', 'STRING', 'GKE pod hosting this image'), + ('event_date', 'STRING', 'The date of this event'), + ('event_type', 'STRING', 'STARTED/SUCCESS/FAILURE'), + ('details', 'STRING', 'Any other relevant details') + ] + desc = ('The table that contains START/SUCCESS/FAILURE events for ' + ' the stress test clients and servers') + return bq_utils.create_table(self.bq, self.project_id, self.dataset_id, + 
self.summary_table_id, summary_table_schema, + desc) + + def __create_qps_table(self): + qps_table_schema = [ + ('run_id', 'INTEGER', 'Test run id'), + ('pod_name', 'STRING', 'GKE pod hosting this image'), + ('recorded_at', 'STRING', 'Metrics recorded at time'), + ('qps', 'INTEGER', 'Queries per second') + ] + desc = 'The table that cointains the qps recorded at various intervals' + return bq_utils.create_table(self.bq, self.project_id, self.dataset_id, + self.qps_table_id, qps_table_schema, desc) diff --git a/tools/run_tests/stress_test_wrapper.py b/tools/run_tests/stress_test_wrapper.py deleted file mode 100755 index 8f1bd2024e..0000000000 --- a/tools/run_tests/stress_test_wrapper.py +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env python2.7 -import os -import re -import select -import subprocess -import sys -import time - -GRPC_ROOT = '/usr/local/google/home/sreek/workspace/grpc/' -STRESS_TEST_IMAGE = GRPC_ROOT + 'bins/opt/stress_test' -STRESS_TEST_ARGS_STR = ' '.join([ - '--server_addresses=localhost:8000', - '--test_cases=empty_unary:1,large_unary:1', '--num_stubs_per_channel=10', - '--test_duration_secs=10']) -METRICS_CLIENT_IMAGE = GRPC_ROOT + 'bins/opt/metrics_client' -METRICS_CLIENT_ARGS_STR = ' '.join([ - '--metrics_server_address=localhost:8081', '--total_only=true']) -LOGFILE_NAME = 'stress_test.log' - - -# TODO (sree): Write a python grpc client to directly query the metrics instead -# of calling metrics_client -def get_qps(metrics_cmd): - qps = 0 - try: - # Note: gpr_log() writes even non-error messages to stderr stream. So it is - # important that we set stderr=subprocess.STDOUT - p = subprocess.Popen(args=metrics_cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - retcode = p.wait() - (out_str, err_str) = p.communicate() - if retcode != 0: - print 'Error in reading metrics information' - print 'Output: ', out_str - else: - # The overall qps is printed at the end of the line - m = re.search('\d+$', out_str) - qps = int(m.group()) if m else 0 - except Exception as ex: - print 'Exception while reading metrics information: ' + str(ex) - return qps - -def main(argv): - # TODO(sree) Create BigQuery Tables - # (Summary table), (Metrics table) - - # TODO(sree) Update status that the test is starting (in the status table) - # - - metrics_cmd = [METRICS_CLIENT_IMAGE - ] + [x for x in METRICS_CLIENT_ARGS_STR.split()] - - stress_cmd = [STRESS_TEST_IMAGE] + [x for x in STRESS_TEST_ARGS_STR.split()] - # TODO(sree): Add an option to print to stdout if logfilename is absent - logfile = open(LOGFILE_NAME, 'w') - stress_p = subprocess.Popen(args=arg_list, - stdout=logfile, - stderr=subprocess.STDOUT) - - qps_history = [1, 1, 1] # Maintain the last 3 qps - qps_history_idx = 0 # Index into the qps_history list - - is_error = False - while True: - # Check if stress_client is still running. If so, collect metrics and upload - # to BigQuery status table - # - if stress_p is not None: - # TODO(sree) Upload completion status to BiqQuery - is_error = (stress_p.returncode != 0) - break - - # Stress client still running. Get metrics - qps = get_qps(metrics_cmd) - - # If QPS has been zero for the last 3 iterations, flag it as error and exit - qps_history[qps_history_idx] = qps - qps_history_idx = (qps_histor_idx + 1) % len(qps_history) - if sum(a) == 0: - print ('QPS has been zero for the last 3 iterations. Not monitoring ' - 'anymore. 
The stress test client may be stalled.') - is_error = True - break - - #TODO(sree) Upload qps metrics to BiqQuery - - if is_error: - print 'Waiting indefinitely..' - select.select([],[],[]) - - return 1 - - -if __name__ == '__main__': - main(sys.argv[1:]) -- cgit v1.2.3 From 2715a39a2e7b5a47ff6726daadeac63c4664550e Mon Sep 17 00:00:00 2001 From: Sree Kuchibhotla Date: Wed, 24 Feb 2016 12:01:52 -0800 Subject: Change RUN_ID type to string to allow for a non-numeric run_id --- tools/big_query/big_query_utils.py | 2 +- tools/gke/run_stress_tests_on_gke.py | 103 +++++++++++++++-------- tools/run_tests/stress_test/run_server.py | 5 ++ tools/run_tests/stress_test/stress_test_utils.py | 17 ++-- 4 files changed, 85 insertions(+), 42 deletions(-) (limited to 'tools') diff --git a/tools/big_query/big_query_utils.py b/tools/big_query/big_query_utils.py index 267d019850..e2379fd1aa 100755 --- a/tools/big_query/big_query_utils.py +++ b/tools/big_query/big_query_utils.py @@ -135,6 +135,6 @@ def sync_query_job(big_query, project_id, query, timeout=5000): # List of (column name, column type, description) tuples def make_row(unique_row_id, row_values_dict): - """row_values_dict is a dictionar of column name and column value. + """row_values_dict is a dictionary of column name and column value. """ return {'insertId': unique_row_id, 'json': row_values_dict} diff --git a/tools/gke/run_stress_tests_on_gke.py b/tools/gke/run_stress_tests_on_gke.py index d0c3887a42..0ea7b7fcc1 100755 --- a/tools/gke/run_stress_tests_on_gke.py +++ b/tools/gke/run_stress_tests_on_gke.py @@ -33,11 +33,17 @@ import subprocess import sys import time +stress_test_utils_dir = os.path.abspath(os.path.join( + os.path.dirname(__file__), '../run_tests/stress_test')) +sys.path.append(stress_test_utils_dir) +from stress_test_utils import BigQueryHelper + import kubernetes_api GRPC_ROOT = os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), '../..')) os.chdir(GRPC_ROOT) + class BigQuerySettings: def __init__(self, run_id, dataset_id, summary_table_id, qps_table_id): @@ -283,27 +289,16 @@ def _launch_client(gcp_project_id, docker_image_name, bq_settings, return True -def _launch_server_and_client(gcp_project_id, docker_image_name, +def _launch_server_and_client(bq_settings, gcp_project_id, docker_image_name, num_client_instances): - # == Big Query tables related settings (Common for both server and client) == - - # Create a unique id for this run (Note: Using timestamp instead of UUID to - # make it easier to deduce the date/time of the run just by looking at the run - # run id. This is useful in debugging when looking at records in Biq query) - run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') - - dataset_id = 'stress_test_%s' % run_id - summary_table_id = 'summary' - qps_table_id = 'qps' - - bq_settings = BigQuerySettings(run_id, dataset_id, summary_table_id, - qps_table_id) - # Start kubernetes proxy kubernetes_api_port = 9001 kubernetes_proxy = KubernetesProxy(kubernetes_api_port) kubernetes_proxy.start() + # num of seconds to wait for the GKE image to start and warmup + image_warmp_secs = 60 + server_pod_name = 'stress-server' server_port = 8080 is_success = _launch_server(gcp_project_id, docker_image_name, bq_settings, @@ -315,7 +310,8 @@ def _launch_server_and_client(gcp_project_id, docker_image_name, # Server takes a while to start. # TODO(sree) Use Kubernetes API to query the status of the server instead of # sleeping - time.sleep(60) + print 'Waiting for %s seconds for the server to start...' 
% image_warmp_secs + time.sleep(image_warmp_secs) # Launch client server_address = '%s.default.svc.cluster.local:%d' % (server_pod_name, @@ -329,6 +325,8 @@ def _launch_server_and_client(gcp_project_id, docker_image_name, print 'Error in launching client(s)' return False + print 'Waiting for %s seconds for the client images to start...' % image_warmp_secs + time.sleep(image_warmp_secs) return True @@ -359,31 +357,68 @@ def _build_and_push_docker_image(gcp_project_id, docker_image_name, tag_name): # TODO(sree): This is just to test the above APIs. Rewrite this to make # everything configurable (like image names / number of instances etc) -def test_run(): - image_name = 'grpc_stress_test' - gcp_project_id = 'sree-gce' - tag_name = 'gcr.io/%s/%s' % (gcp_project_id, image_name) - num_client_instances = 3 +def run_test(skip_building_image, gcp_project_id, image_name, tag_name, + num_client_instances, poll_interval_secs, total_duration_secs): + if not skip_building_image: + is_success = _build_docker_image(image_name, tag_name) + if not is_success: + return False - is_success = _build_docker_image(image_name, tag_name) - if not is_success: - return + is_success = _push_docker_image_to_gke_registry(tag_name) + if not is_success: + return False - is_success = _push_docker_image_to_gke_registry(tag_name) - if not is_success: - return + # == Big Query tables related settings (Common for both server and client) == + + # Create a unique id for this run (Note: Using timestamp instead of UUID to + # make it easier to deduce the date/time of the run just by looking at the run + # run id. This is useful in debugging when looking at records in Biq query) + run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + dataset_id = 'stress_test_%s' % run_id + summary_table_id = 'summary' + qps_table_id = 'qps' + bq_settings = BigQuerySettings(run_id, dataset_id, summary_table_id, + qps_table_id) - is_success = _launch_server_and_client(gcp_project_id, tag_name, + bq_helper = BigQueryHelper(run_id, '', '', gcp_project_id, dataset_id, + summary_table_id, qps_table_id) + bq_helper.initialize() + is_success = _launch_server_and_client(bq_settings, gcp_project_id, tag_name, num_client_instances) + if not is_success: + return False + + start_time = datetime.datetime.now() + end_time = start_time + datetime.timedelta(seconds=total_duration_secs) - # Run the test for 2 mins - time.sleep(120) + while True: + if datetime.datetime.now() > end_time: + print 'Test was run for %d seconds' % total_duration_secs + break - is_success = _delete_server_and_client(num_client_instances) + # Check if either stress server or clients have failed + if not bq_helper.check_if_any_tests_failed(): + is_success = False + print 'Some tests failed.' + break + # Things seem to be running fine. 
Wait until next poll time to check the + # status + time.sleep(poll_interval_secs) - if not is_success: - return + # Print BiqQuery tables + bq_helper.print_summary_records() + bq_helper.print_qps_records() + + _delete_server_and_client(num_client_instances) + return is_success if __name__ == '__main__': - test_run() + image_name = 'grpc_stress_test' + gcp_project_id = 'sree-gce' + tag_name = 'gcr.io/%s/%s' % (gcp_project_id, image_name) + num_client_instances = 3 + poll_interval_secs = 5, + test_duration_secs = 150 + run_test(True, gcp_project_id, image_name, tag_name, num_client_instances, + poll_interval_secs, test_duration_secs) diff --git a/tools/run_tests/stress_test/run_server.py b/tools/run_tests/stress_test/run_server.py index 9ad8d63638..64322f6100 100755 --- a/tools/run_tests/stress_test/run_server.py +++ b/tools/run_tests/stress_test/run_server.py @@ -72,6 +72,11 @@ def run_server(): logfile_name = env.get('LOGFILE_NAME') + print('pod_name: %s, project_id: %s, run_id: %s, dataset_id: %s, ' + 'summary_table_id: %s, qps_table_id: %s') % ( + pod_name, project_id, run_id, dataset_id, summary_table_id, + qps_table_id) + bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id, dataset_id, summary_table_id, qps_table_id) bq_helper.initialize() diff --git a/tools/run_tests/stress_test/stress_test_utils.py b/tools/run_tests/stress_test/stress_test_utils.py index a0626ce3ac..71f0dcd921 100755 --- a/tools/run_tests/stress_test/stress_test_utils.py +++ b/tools/run_tests/stress_test/stress_test_utils.py @@ -43,11 +43,13 @@ bq_utils_dir = os.path.abspath(os.path.join( sys.path.append(bq_utils_dir) import big_query_utils as bq_utils + class EventType: STARTING = 'STARTING' SUCCESS = 'SUCCESS' FAILURE = 'FAILURE' + class BigQueryHelper: """Helper class for the stress test wrappers to interact with BigQuery. 
""" @@ -101,9 +103,9 @@ class BigQueryHelper: self.qps_table_id, [row]) def check_if_any_tests_failed(self, num_query_retries=3): - query = ('SELECT event_type FROM %s.%s WHERE run_id = %s AND ' + query = ('SELECT event_type FROM %s.%s WHERE run_id = \'%s\' AND ' 'event_type="%s"') % (self.dataset_id, self.summary_table_id, - self.run_id, EventType.FAILURE) + self.run_id, EventType.FAILURE) query_job = bq_utils.sync_query_job(self.bq, self.project_id, query) page = self.bq.jobs().getQueryResults(**query_job['jobReference']).execute( num_retries=num_query_retries) @@ -119,7 +121,7 @@ class BigQueryHelper: print 'Run Id', self.run_id print line query = ('SELECT pod_name, image_type, event_type, event_date, details' - ' FROM %s.%s WHERE run_id = %s ORDER by event_date;') % ( + ' FROM %s.%s WHERE run_id = \'%s\' ORDER by event_date;') % ( self.dataset_id, self.summary_table_id, self.run_id) query_job = bq_utils.sync_query_job(self.bq, self.project_id, query) @@ -147,8 +149,9 @@ class BigQueryHelper: print 'Run Id: ', self.run_id print line query = ( - 'SELECT pod_name, recorded_at, qps FROM %s.%s WHERE run_id = %s ORDER ' - 'by recorded_at;') % (self.dataset_id, self.qps_table_id, self.run_id) + 'SELECT pod_name, recorded_at, qps FROM %s.%s WHERE run_id = \'%s\' ' + 'ORDER by recorded_at;') % (self.dataset_id, self.qps_table_id, + self.run_id) query_job = bq_utils.sync_query_job(self.bq, self.project_id, query) print '{:<25} {:30} {}'.format('Pod name', 'Recorded at', 'Qps') print line @@ -167,7 +170,7 @@ class BigQueryHelper: def __create_summary_table(self): summary_table_schema = [ - ('run_id', 'INTEGER', 'Test run id'), + ('run_id', 'STRING', 'Test run id'), ('image_type', 'STRING', 'Client or Server?'), ('pod_name', 'STRING', 'GKE pod hosting this image'), ('event_date', 'STRING', 'The date of this event'), @@ -182,7 +185,7 @@ class BigQueryHelper: def __create_qps_table(self): qps_table_schema = [ - ('run_id', 'INTEGER', 'Test run id'), + ('run_id', 'STRING', 'Test run id'), ('pod_name', 'STRING', 'GKE pod hosting this image'), ('recorded_at', 'STRING', 'Metrics recorded at time'), ('qps', 'INTEGER', 'Queries per second') -- cgit v1.2.3 From 8bcbee815c5d7aa6cda13371241536db6dc03fbb Mon Sep 17 00:00:00 2001 From: Sree Kuchibhotla Date: Wed, 24 Feb 2016 15:52:54 -0800 Subject: Fix a bug in failure check --- tools/gke/run_stress_tests_on_gke.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/gke/run_stress_tests_on_gke.py b/tools/gke/run_stress_tests_on_gke.py index 0ea7b7fcc1..9a8a33cac5 100755 --- a/tools/gke/run_stress_tests_on_gke.py +++ b/tools/gke/run_stress_tests_on_gke.py @@ -244,7 +244,7 @@ def _launch_client(gcp_project_id, docker_image_name, bq_settings, client_arg_list = [] # TODO(sree) Make this configurable (and also less frequent) - poll_interval_secs = 5 + poll_interval_secs = 30 metrics_port = 8081 metrics_server_address = 'localhost:%d' % metrics_port @@ -397,7 +397,7 @@ def run_test(skip_building_image, gcp_project_id, image_name, tag_name, break # Check if either stress server or clients have failed - if not bq_helper.check_if_any_tests_failed(): + if bq_helper.check_if_any_tests_failed(): is_success = False print 'Some tests failed.' 
break
-- cgit v1.2.3

From f63c49238e1696d2f74c68d6323e959ceb8b8791 Mon Sep 17 00:00:00 2001
From: Sree Kuchibhotla
Date: Wed, 24 Feb 2016 17:31:52 -0800
Subject: Some more cleanup

---
 tools/gke/run_stress_tests_on_gke.py             |  3 ++-
 tools/run_tests/stress_test/stress_test_utils.py | 12 +++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/gke/run_stress_tests_on_gke.py b/tools/gke/run_stress_tests_on_gke.py
index 9a8a33cac5..065b11e91c 100755
--- a/tools/gke/run_stress_tests_on_gke.py
+++ b/tools/gke/run_stress_tests_on_gke.py
@@ -403,6 +403,7 @@ def run_test(skip_building_image, gcp_project_id, image_name, tag_name,
       break
     # Things seem to be running fine. Wait until next poll time to check the
     # status
+    print 'Sleeping for %d seconds..' % poll_interval_secs
     time.sleep(poll_interval_secs)
 
   # Print BiqQuery tables
diff --git a/tools/run_tests/stress_test/stress_test_utils.py b/tools/run_tests/stress_test/stress_test_utils.py
index 71f0dcd921..7adc0068f9 100755
--- a/tools/run_tests/stress_test/stress_test_utils.py
+++ b/tools/run_tests/stress_test/stress_test_utils.py
@@ -81,10 +81,9 @@ class BigQueryHelper:
         'event_type': event_type,
         'details': details
     }
-    # Something that uniquely identifies the row (Biquery needs it for duplicate
-    # detection).
+    # row_unique_id is something that uniquely identifies the row (BigQuery uses
+    # it for duplicate detection).
     row_unique_id = '%s_%s_%s' % (self.run_id, self.pod_name, event_type)
-
     row = bq_utils.make_row(row_unique_id, row_values_dict)
@@ -97,6 +96,8 @@
         'qps': qps
     }
 
+    # row_unique_id is something that uniquely identifies the row (BigQuery uses
+    # it for duplicate detection).
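# Editor's sketch (not part of the patch): the insertId-based dedup described
# in the comment above can be exercised standalone with the helpers from
# tools/big_query/big_query_utils.py. The project/dataset/table names below are
# placeholders; assumes application-default credentials are configured.
import big_query_utils as bq_utils

bq = bq_utils.create_big_query()
row = bq_utils.make_row('run42_pod1_STARTING',         # insertId
                        {'run_id': 'run42',
                         'pod_name': 'pod1',
                         'event_type': 'STARTING'})
bq_utils.insert_rows(bq, 'my-project', 'my_dataset', 'summary', [row])
# A retry with the same insertId within BigQuery's dedup window is dropped:
bq_utils.insert_rows(bq, 'my-project', 'my_dataset', 'summary', [row])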
row_unique_id = '%s_%s_%s' % (self.run_id, self.pod_name, recorded_at) row = bq_utils.make_row(row_unique_id, row_values_dict) return bq_utils.insert_rows(self.bq, self.project_id, self.dataset_id, @@ -109,7 +110,6 @@ class BigQueryHelper: query_job = bq_utils.sync_query_job(self.bq, self.project_id, query) page = self.bq.jobs().getQueryResults(**query_job['jobReference']).execute( num_retries=num_query_retries) - print page num_failures = int(page['totalRows']) print 'num rows: ', num_failures return num_failures > 0 @@ -118,7 +118,8 @@ class BigQueryHelper: line = '-' * 120 print line print 'Summary records' - print 'Run Id', self.run_id + print 'Run Id: ', self.run_id + print 'Dataset Id: ', self.dataset_id print line query = ('SELECT pod_name, image_type, event_type, event_date, details' ' FROM %s.%s WHERE run_id = \'%s\' ORDER by event_date;') % ( @@ -147,6 +148,7 @@ class BigQueryHelper: print line print 'QPS Summary' print 'Run Id: ', self.run_id + print 'Dataset Id: ', self.dataset_id print line query = ( 'SELECT pod_name, recorded_at, qps FROM %s.%s WHERE run_id = \'%s\' ' -- cgit v1.2.3 From 61c134f5f83a1897264016ef765e82b61ebd3992 Mon Sep 17 00:00:00 2001 From: Sree Kuchibhotla Date: Fri, 26 Feb 2016 11:03:29 -0800 Subject: Significantly rewrite tools/gke/run_stress_tests_on_gke.py and make everything configurable --- tools/gke/kubernetes_api.py | 47 ++- tools/gke/run_stress_tests_on_gke.py | 541 ++++++++++++++++++++++------------- 2 files changed, 387 insertions(+), 201 deletions(-) (limited to 'tools') diff --git a/tools/gke/kubernetes_api.py b/tools/gke/kubernetes_api.py index d14c26ad6a..e1017e9da6 100755 --- a/tools/gke/kubernetes_api.py +++ b/tools/gke/kubernetes_api.py @@ -50,7 +50,8 @@ def _make_pod_config(pod_name, image_name, container_port_list, cmd_list, 'name': pod_name, 'image': image_name, 'ports': [{'containerPort': port, - 'protocol': 'TCP'} for port in container_port_list], + 'protocol': 'TCP'} + for port in container_port_list], 'imagePullPolicy': 'Always' } ] @@ -222,3 +223,47 @@ def delete_pod(kube_host, kube_port, namespace, pod_name): del_url = 'http://%s:%d/api/v1/namespaces/%s/pods/%s' % (kube_host, kube_port, namespace, pod_name) return _do_delete(del_url, 'Delete Pod') + + +def create_pod_and_service(kube_host, kube_port, namespace, pod_name, + image_name, container_port_list, cmd_list, arg_list, + env_dict, is_headless_service): + """A simple helper function that creates a pod and a service (if pod creation was successful).""" + is_success = create_pod(kube_host, kube_port, namespace, pod_name, image_name, + container_port_list, cmd_list, arg_list, env_dict) + if not is_success: + print 'Error in creating Pod' + return False + + is_success = create_service( + kube_host, + kube_port, + namespace, + pod_name, # Use pod_name for service + pod_name, + container_port_list, # Service port list same as container port list + container_port_list, + is_headless_service) + if not is_success: + print 'Error in creating Service' + return False + + print 'Successfully created the pod/service %s' % pod_name + return True + + +def delete_pod_and_service(kube_host, kube_port, namespace, pod_name): + """ A simple helper function that calls delete_pod and delete_service """ + is_success = delete_pod(kube_host, kube_port, namespace, pod_name) + if not is_success: + print 'Error in deleting pod %s' % pod_name + return False + + # Note: service name assumed to the the same as pod name + is_success = delete_service(kube_host, kube_port, namespace, pod_name) + if not 
is_success: + print 'Error in deleting service %s' % pod_name + return False + + print 'Successfully deleted the Pod/Service: %s' % pod_name + return True diff --git a/tools/gke/run_stress_tests_on_gke.py b/tools/gke/run_stress_tests_on_gke.py index 065b11e91c..cf0ef595a6 100755 --- a/tools/gke/run_stress_tests_on_gke.py +++ b/tools/gke/run_stress_tests_on_gke.py @@ -27,6 +27,7 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import argparse import datetime import os import subprocess @@ -40,17 +41,52 @@ from stress_test_utils import BigQueryHelper import kubernetes_api -GRPC_ROOT = os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), '../..')) -os.chdir(GRPC_ROOT) +_GRPC_ROOT = os.path.abspath(os.path.join( + os.path.dirname(sys.argv[0]), '../..')) +os.chdir(_GRPC_ROOT) +# num of seconds to wait for the GKE image to start and warmup +_GKE_IMAGE_WARMUP_WAIT_SECS = 60 -class BigQuerySettings: +_SERVER_POD_NAME = 'stress-server' +_CLIENT_POD_NAME_PREFIX = 'stress-client' +_DATASET_ID_PREFIX = 'stress_test' +_SUMMARY_TABLE_ID = 'summary' +_QPS_TABLE_ID = 'qps' - def __init__(self, run_id, dataset_id, summary_table_id, qps_table_id): - self.run_id = run_id - self.dataset_id = dataset_id - self.summary_table_id = summary_table_id - self.qps_table_id = qps_table_id +_DEFAULT_DOCKER_IMAGE_NAME = 'grpc_stress_test' + +# The default port on which the kubernetes proxy server is started on localhost +# (i.e kubectl proxy --port=) +_DEFAULT_KUBERNETES_PROXY_PORT = 8001 + +# How frequently should the stress client wrapper script (running inside a GKE +# container) poll the health of the stress client (also running inside the GKE +# container) and upload metrics to BigQuery +_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS = 60 + +# The default setting for stress test server and client +_DEFAULT_STRESS_SERVER_PORT = 8080 +_DEFAULT_METRICS_PORT = 8081 +_DEFAULT_TEST_CASES_STR = 'empty_unary:1,large_unary:1,client_streaming:1,server_streaming:1,empty_stream:1' +_DEFAULT_NUM_CHANNELS_PER_SERVER = 5 +_DEFAULT_NUM_STUBS_PER_CHANNEL = 10 +_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS = 30 + +# Number of stress client instances to launch +_DEFAULT_NUM_CLIENTS = 3 + +# How frequently should this test monitor the health of Stress clients and +# Servers running in GKE +_DEFAULT_TEST_POLL_INTERVAL_SECS = 60 + +# Default run time for this test (2 hour) +_DEFAULT_TEST_DURATION_SECS = 7200 + +# The number of seconds it would take a GKE pod to warm up (i.e get to 'Running' +# state from the time of creation). Ideally this is something the test should +# automatically determine by using Kubernetes API to poll the pods status. +_DEFAULT_GKE_WARMUP_SECS = 60 class KubernetesProxy: @@ -76,11 +112,74 @@ class KubernetesProxy: def __del__(self): if self.p is not None: + print 'Shutting down Kubernetes proxy..' 
self.p.kill() +class TestSettings: + + def __init__(self, build_docker_image, test_poll_interval_secs, + test_duration_secs, kubernetes_proxy_port): + self.build_docker_image = build_docker_image + self.test_poll_interval_secs = test_poll_interval_secs + self.test_duration_secs = test_duration_secs + self.kubernetes_proxy_port = kubernetes_proxy_port + + +class GkeSettings: + + def __init__(self, project_id, docker_image_name): + self.project_id = project_id + self.docker_image_name = docker_image_name + self.tag_name = 'gcr.io/%s/%s' % (project_id, docker_image_name) + + +class BigQuerySettings: + + def __init__(self, run_id, dataset_id, summary_table_id, qps_table_id): + self.run_id = run_id + self.dataset_id = dataset_id + self.summary_table_id = summary_table_id + self.qps_table_id = qps_table_id + + +class StressServerSettings: + + def __init__(self, server_pod_name, server_port): + self.server_pod_name = server_pod_name + self.server_port = server_port + + +class StressClientSettings: + + def __init__(self, num_clients, client_pod_name_prefix, server_pod_name, + server_port, metrics_port, metrics_collection_interval_secs, + stress_client_poll_interval_secs, num_channels_per_server, + num_stubs_per_channel, test_cases_str): + self.num_clients = num_clients + self.client_pod_name_prefix = client_pod_name_prefix + self.server_pod_name = server_pod_name + self.server_port = server_port + self.metrics_port = metrics_port + self.metrics_collection_interval_secs = metrics_collection_interval_secs + self.stress_client_poll_interval_secs = stress_client_poll_interval_secs + self.num_channels_per_server = num_channels_per_server + self.num_stubs_per_channel = num_stubs_per_channel + self.test_cases_str = test_cases_str + + # == Derived properties == + # Note: Client can accept a list of server addresses (a comma separated list + # of 'server_name:server_port'). 
In this case, we only have one server + # address to pass + self.server_addresses = '%s.default.svc.cluster.local:%d' % ( + server_pod_name, server_port) + self.client_pod_names_list = ['%s-%d' % (client_pod_name_prefix, i) + for i in range(1, num_clients + 1)] + + def _build_docker_image(image_name, tag_name): """ Build the docker image and add a tag """ + print 'Building docker image: %s' % image_name os.environ['INTEROP_IMAGE'] = image_name # Note that 'BASE_NAME' HAS to be 'grpc_interop_stress_cxx' since the script # build_interop_stress_image.sh invokes the following script: @@ -93,6 +192,7 @@ def _build_docker_image(image_name, tag_name): print 'Error in building docker image' return False + print 'Adding an additional tag %s to the image %s' % (tag_name, image_name) cmd = ['docker', 'tag', '-f', image_name, tag_name] p = subprocess.Popen(args=cmd) retcode = p.wait() @@ -115,144 +215,86 @@ def _push_docker_image_to_gke_registry(docker_tag_name): return True -def _launch_image_on_gke(kubernetes_api_server, kubernetes_api_port, namespace, - pod_name, image_name, port_list, cmd_list, arg_list, - env_dict, is_headless_service): - """Creates a GKE Pod and a Service object for a given image by calling Kubernetes API""" - is_success = kubernetes_api.create_pod( - kubernetes_api_server, - kubernetes_api_port, - namespace, - pod_name, - image_name, - port_list, # The ports to be exposed on this container/pod - cmd_list, # The command that launches the stress server - arg_list, - env_dict # Environment variables to be passed to the pod - ) - if not is_success: - print 'Error in creating Pod' - return False - - is_success = kubernetes_api.create_service( - kubernetes_api_server, - kubernetes_api_port, - namespace, - pod_name, # Use the pod name for service name as well - pod_name, - port_list, # Service port list - port_list, # Container port list (same as service port list) - is_headless_service) - if not is_success: - print 'Error in creating Service' - return False - - print 'Successfully created the pod/service %s' % pod_name - return True - - -def _delete_image_on_gke(kubernetes_proxy, pod_name_list): - """Deletes a GKE Pod and Service object for given list of Pods by calling Kubernetes API""" - if not kubernetes_proxy.is_started: - print 'Kubernetes proxy must be started before calling this function' - return False - - is_success = True - for pod_name in pod_name_list: - is_success = kubernetes_api.delete_pod( - 'localhost', kubernetes_proxy.get_port(), 'default', pod_name) - if not is_success: - print 'Error in deleting pod %s' % pod_name - break - - is_success = kubernetes_api.delete_service( - 'localhost', kubernetes_proxy.get_port(), 'default', - pod_name) # service name same as pod name - if not is_success: - print 'Error in deleting service %s' % pod_name - break - - if is_success: - print 'Successfully deleted the Pods/Services: %s' % ','.join(pod_name_list) - - return is_success - - -def _launch_server(gcp_project_id, docker_image_name, bq_settings, - kubernetes_proxy, server_pod_name, server_port): +def _launch_server(gke_settings, stress_server_settings, bq_settings, + kubernetes_proxy): """ Launches a stress test server instance in GKE cluster """ if not kubernetes_proxy.is_started: print 'Kubernetes proxy must be started before calling this function' return False + # This is the wrapper script that is run in the container. 
This script runs + # the actual stress test server server_cmd_list = [ '/var/local/git/grpc/tools/run_tests/stress_test/run_server.py' - ] # Process that is launched - server_arg_list = [] # run_server.py does not take any args (for now) + ] - # == Parameters to the server process launched in GKE == + # run_server.py does not take any args from the command line. The args are + # instead passed via environment variables (see server_env below) + server_arg_list = [] + + # The parameters to the script run_server.py are injected into the container + # via environment variables server_env = { 'STRESS_TEST_IMAGE_TYPE': 'SERVER', 'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/interop_server', - 'STRESS_TEST_ARGS_STR': '--port=%s' % server_port, + 'STRESS_TEST_ARGS_STR': '--port=%s' % stress_server_settings.server_port, 'RUN_ID': bq_settings.run_id, - 'POD_NAME': server_pod_name, - 'GCP_PROJECT_ID': gcp_project_id, + 'POD_NAME': stress_server_settings.server_pod_name, + 'GCP_PROJECT_ID': gke_settings.project_id, 'DATASET_ID': bq_settings.dataset_id, 'SUMMARY_TABLE_ID': bq_settings.summary_table_id, 'QPS_TABLE_ID': bq_settings.qps_table_id } # Launch Server - is_success = _launch_image_on_gke( + is_success = kubernetes_api.create_pod_and_service( 'localhost', kubernetes_proxy.get_port(), - 'default', - server_pod_name, - docker_image_name, - [server_port], # Port that should be exposed on the container + 'default', # Use 'default' namespace + stress_server_settings.server_pod_name, + gke_settings.tag_name, + [stress_server_settings.server_port], # Port that should be exposed server_cmd_list, server_arg_list, server_env, - True # Headless = True for server. Since we want DNS records to be greated by GKE + True # Headless = True for server. Since we want DNS records to be created by GKE ) return is_success -def _launch_client(gcp_project_id, docker_image_name, bq_settings, - kubernetes_proxy, num_instances, client_pod_name_prefix, - server_pod_name, server_port): +def _launch_client(gke_settings, stress_server_settings, stress_client_settings, + bq_settings, kubernetes_proxy): """ Launches a configurable number of stress test clients on GKE cluster """ if not kubernetes_proxy.is_started: print 'Kubernetes proxy must be started before calling this function' return False - server_address = '%s.default.svc.cluster.local:%d' % (server_pod_name, - server_port) - #TODO(sree) Make the whole client args configurable - test_cases_str = 'empty_unary:1,large_unary:1' stress_client_arg_list = [ - '--server_addresses=%s' % server_address, - '--test_cases=%s' % test_cases_str, '--num_stubs_per_channel=10' + '--server_addresses=%s' % stress_client_settings.server_addresses, + '--test_cases=%s' % stress_client_settings.test_cases_str, + '--num_stubs_per_channel=%d' % + stress_client_settings.num_stubs_per_channel ] + # This is the wrapper script that is run in the container. This script runs + # the actual stress client client_cmd_list = [ '/var/local/git/grpc/tools/run_tests/stress_test/run_client.py' ] - # run_client.py takes no args. All args are passed as env variables - client_arg_list = [] - # TODO(sree) Make this configurable (and also less frequent) - poll_interval_secs = 30 + # run_client.py takes no args. 
All args are passed as env variables (see
+  # client_env)
+  client_arg_list = []

-  metrics_port = 8081
-  metrics_server_address = 'localhost:%d' % metrics_port
+  metrics_server_address = 'localhost:%d' % stress_client_settings.metrics_port
   metrics_client_arg_list = [
       '--metrics_server_address=%s' % metrics_server_address,
       '--total_only=true'
   ]

+  # The parameters to the script run_client.py are injected into the container
+  # via environment variables
   client_env = {
       'STRESS_TEST_IMAGE_TYPE': 'CLIENT',
       'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/stress_test',
@@ -260,27 +302,28 @@ def _launch_client(gcp_project_id, docker_image_name, bq_settings,
       'METRICS_CLIENT_IMAGE': '/var/local/git/grpc/bins/opt/metrics_client',
       'METRICS_CLIENT_ARGS_STR': ' '.join(metrics_client_arg_list),
       'RUN_ID': bq_settings.run_id,
-      'POLL_INTERVAL_SECS': str(poll_interval_secs),
-      'GCP_PROJECT_ID': gcp_project_id,
+      'POLL_INTERVAL_SECS':
+          str(stress_client_settings.stress_client_poll_interval_secs),
+      'GCP_PROJECT_ID': gke_settings.project_id,
       'DATASET_ID': bq_settings.dataset_id,
       'SUMMARY_TABLE_ID': bq_settings.summary_table_id,
       'QPS_TABLE_ID': bq_settings.qps_table_id
   }

-  for i in range(1, num_instances + 1):
-    pod_name = '%s-%d' % (client_pod_name_prefix, i)
+  for pod_name in stress_client_settings.client_pod_names_list:
     client_env['POD_NAME'] = pod_name
-    is_success = _launch_image_on_gke(
-        'localhost',
+    is_success = kubernetes_api.create_pod_and_service(
+        'localhost',  # Since proxy is running on localhost
         kubernetes_proxy.get_port(),
-        'default',
+        'default',  # default namespace
         pod_name,
-        docker_image_name,
-        [metrics_port],  # Client pods expose metrics port
+        gke_settings.tag_name,
+        [stress_client_settings.metrics_port],  # Client pods expose metrics port
         client_cmd_list,
         client_arg_list,
         client_env,
-        False  # Client is not a headless service.
+        False  # Client is not a headless service
     )
     if not is_success:
       print 'Error in launching client %s' % pod_name
@@ -289,20 +332,17 @@ def _launch_client(gcp_project_id, docker_image_name, bq_settings,
   return True

-def _launch_server_and_client(bq_settings, gcp_project_id, docker_image_name,
-                              num_client_instances):
+def _launch_server_and_client(gke_settings, stress_server_settings,
+                              stress_client_settings, bq_settings,
+                              kubernetes_proxy_port):
   # Start kubernetes proxy
-  kubernetes_api_port = 9001
-  kubernetes_proxy = KubernetesProxy(kubernetes_api_port)
+  print 'Starting Kubernetes proxy..'
+  kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port)
   kubernetes_proxy.start()

-  # num of seconds to wait for the GKE image to start and warmup
-  image_warmp_secs = 60
-
-  server_pod_name = 'stress-server'
-  server_port = 8080
-  is_success = _launch_server(gcp_project_id, docker_image_name, bq_settings,
-                              kubernetes_proxy, server_pod_name, server_port)
+  print 'Launching server..'
+  is_success = _launch_server(gke_settings, stress_server_settings, bq_settings,
+                              kubernetes_proxy)
   if not is_success:
     print 'Error in launching server'
     return False
@@ -310,116 +350,217 @@ def _launch_server_and_client(bq_settings, gcp_project_id, docker_image_name,
   # Server takes a while to start.
   # TODO(sree) Use Kubernetes API to query the status of the server instead of
   # sleeping
-  print 'Waiting for %s seconds for the server to start...' % image_warmp_secs
-  time.sleep(image_warmp_secs)
+  print 'Waiting for %s seconds for the server to start...' % _GKE_IMAGE_WARMUP_WAIT_SECS
+  time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS)

   # Launch client
-  server_address = '%s.default.svc.cluster.local:%d' % (server_pod_name,
-                                                        server_port)
   client_pod_name_prefix = 'stress-client'
-  is_success = _launch_client(gcp_project_id, docker_image_name, bq_settings,
-                              kubernetes_proxy, num_client_instances,
-                              client_pod_name_prefix, server_pod_name,
-                              server_port)
+  is_success = _launch_client(gke_settings, stress_server_settings,
+                              stress_client_settings, bq_settings,
+                              kubernetes_proxy)
+
   if not is_success:
     print 'Error in launching client(s)'
     return False

-  print 'Waiting for %s seconds for the client images to start...' % image_warmp_secs
-  time.sleep(image_warmp_secs)
+  print 'Waiting for %s seconds for the client images to start...' % _GKE_IMAGE_WARMUP_WAIT_SECS
+  time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS)
   return True
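The TODO above is worth making concrete: instead of sleeping for a fixed _GKE_IMAGE_WARMUP_WAIT_SECS, the script could poll the pod state through the same local proxy. A minimal sketch of that idea, assuming the standard Kubernetes v1 REST endpoint and the third-party 'requests' library (both assumptions; this helper is not part of the patch):

import time

import requests  # assumed to be available; not a dependency of this patch


def _wait_for_pod_running(kube_host, kube_port, namespace, pod_name,
                          timeout_secs=120, poll_secs=5):
  # Poll the pod resource until status.phase reports 'Running' or we time out
  url = 'http://%s:%d/api/v1/namespaces/%s/pods/%s' % (kube_host, kube_port,
                                                       namespace, pod_name)
  deadline = time.time() + timeout_secs
  while time.time() < deadline:
    pod = requests.get(url).json()
    if pod.get('status', {}).get('phase') == 'Running':
      return True
    time.sleep(poll_secs)
  return False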
-def _delete_server_and_client(num_client_instances):
-  kubernetes_api_port = 9001
-  kubernetes_proxy = KubernetesProxy(kubernetes_api_port)
+def _delete_server_and_client(stress_server_settings, stress_client_settings,
+                              kubernetes_proxy_port):
+  kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port)
   kubernetes_proxy.start()

   # Delete clients first
-  client_pod_names = ['stress-client-%d' % i
-                      for i in range(1, num_client_instances + 1)]
-
-  is_success = _delete_image_on_gke(kubernetes_proxy, client_pod_names)
-  if not is_success:
-    return False
+  is_success = True
+  for pod_name in stress_client_settings.client_pod_names_list:
+    is_success = kubernetes_api.delete_pod_and_service(
+        'localhost', kubernetes_proxy_port, 'default', pod_name)
+    if not is_success:
+      return False

   # Delete server
-  server_pod_name = 'stress-server'
-  return _delete_image_on_gke(kubernetes_proxy, [server_pod_name])
+  is_success = kubernetes_api.delete_pod_and_service(
+      'localhost', kubernetes_proxy_port, 'default',
+      stress_server_settings.server_pod_name)
+  return is_success
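run_test_main below derives every BigQuery identifier from a single timestamp-based run id. Concretely, the generated names come out like this (a small sketch; the timestamp value is illustrative):

import datetime

run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
print run_id                             # e.g. 2016_02_26_16_47_47
print '%s_%s' % ('stress_test', run_id)  # the dataset id, e.g. stress_test_2016_02_26_16_47_47
# 'stress_test' here stands in for the module constant _DATASET_ID_PREFIX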
-def _build_and_push_docker_image(gcp_project_id, docker_image_name, tag_name):
-  is_success = _build_docker_image(docker_image_name, tag_name)
-  if not is_success:
-    return False
-  return _push_docker_image_to_gke_registry(tag_name)
-
+def run_test_main(test_settings, gke_settings, stress_server_settings,
+                  stress_client_settings):
+  is_success = True

-# TODO(sree): This is just to test the above APIs. Rewrite this to make
-# everything configurable (like image names / number of instances etc)
-def run_test(skip_building_image, gcp_project_id, image_name, tag_name,
-             num_client_instances, poll_interval_secs, total_duration_secs):
-  if not skip_building_image:
-    is_success = _build_docker_image(image_name, tag_name)
+  if test_settings.build_docker_image:
+    is_success = _build_docker_image(gke_settings.docker_image_name,
+                                     gke_settings.tag_name)
     if not is_success:
       return False

-    is_success = _push_docker_image_to_gke_registry(tag_name)
+    is_success = _push_docker_image_to_gke_registry(gke_settings.tag_name)
     if not is_success:
       return False

-  # == Big Query tables related settings (Common for both server and client) ==
-
   # Create a unique id for this run (Note: Using timestamp instead of UUID to
   # make it easier to deduce the date/time of the run just by looking at the
   # run id. This is useful in debugging when looking at records in BigQuery)
   run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
-  dataset_id = 'stress_test_%s' % run_id
-  summary_table_id = 'summary'
-  qps_table_id = 'qps'
-  bq_settings = BigQuerySettings(run_id, dataset_id, summary_table_id,
-                                 qps_table_id)
-
-  bq_helper = BigQueryHelper(run_id, '', '', gcp_project_id, dataset_id,
-                             summary_table_id, qps_table_id)
+  dataset_id = '%s_%s' % (_DATASET_ID_PREFIX, run_id)
+
+  # Big Query settings (common for both Stress Server and Client)
+  bq_settings = BigQuerySettings(run_id, dataset_id, _SUMMARY_TABLE_ID,
+                                 _QPS_TABLE_ID)
+
+  bq_helper = BigQueryHelper(run_id, '', '', gke_settings.project_id,
+                             dataset_id, _SUMMARY_TABLE_ID, _QPS_TABLE_ID)
   bq_helper.initialize()

-  is_success = _launch_server_and_client(bq_settings, gcp_project_id, tag_name,
-                                         num_client_instances)
-  if not is_success:
-    return False

-  start_time = datetime.datetime.now()
-  end_time = start_time + datetime.timedelta(seconds=total_duration_secs)
-
-  while True:
-    if datetime.datetime.now() > end_time:
-      print 'Test was run for %d seconds' % total_duration_secs
-      break
-
-    # Check if either stress server or clients have failed
-    if bq_helper.check_if_any_tests_failed():
-      is_success = False
-      print 'Some tests failed.'
-      break
-    # Things seem to be running fine. Wait until next poll time to check the
-    # status
-    print 'Sleeping for %d seconds..' % poll_interval_secs
-    time.sleep(poll_interval_secs)
-
-  # Print BiqQuery tables
-  bq_helper.print_summary_records()
-  bq_helper.print_qps_records()
-
-  _delete_server_and_client(num_client_instances)
+  try:
+    is_success = _launch_server_and_client(gke_settings, stress_server_settings,
+                                           stress_client_settings, bq_settings,
+                                           test_settings.kubernetes_proxy_port)
+    if not is_success:
+      return False
+
+    start_time = datetime.datetime.now()
+    end_time = start_time + datetime.timedelta(
+        seconds=test_settings.test_duration_secs)
+    print 'Running the test until %s' % end_time.isoformat()
+
+    while True:
+      if datetime.datetime.now() > end_time:
+        print 'Test was run for %d seconds' % test_settings.test_duration_secs
+        break
+
+      # Check if either stress server or clients have failed
+      if bq_helper.check_if_any_tests_failed():
+        is_success = False
+        print 'Some tests failed.'
+        break
+
+      # Things seem to be running fine. Wait until next poll time to check the
+      # status
+      print 'Sleeping for %d seconds..' % test_settings.test_poll_interval_secs
+      time.sleep(test_settings.test_poll_interval_secs)
+
+    # Print BigQuery tables
+    bq_helper.print_summary_records()
+    bq_helper.print_qps_records()
+
+  finally:
+    # If is_success is False at this point, it means that the stress tests were
+    # started successfully but failed while running the tests. In this case we
+    # should not delete the pods (since they contain all the failure
+    # information)
+    if is_success:
+      _delete_server_and_client(stress_server_settings, stress_client_settings,
+                                test_settings.kubernetes_proxy_port)

   return is_success
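Because a failed run deliberately leaves its pods and BigQuery records in place, a run can also be inspected after the fact. A minimal sketch using the big_query_utils helpers from this patch series (the project id and run id below are placeholders):

import big_query_utils as bq_utils

bq = bq_utils.create_big_query()
run_id = '2016_02_26_16_47_47'  # placeholder run id
query = ('SELECT pod_name, event_type, details FROM %s.%s '
         'WHERE run_id = \'%s\'') % ('stress_test_%s' % run_id, 'summary',
                                     run_id)
query_job = bq_utils.sync_query_job(bq, 'my-gcp-project', query)  # placeholder project id
page = bq.jobs().getQueryResults(**query_job['jobReference']).execute()
print page.get('rows', [])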
+argp = argparse.ArgumentParser(
+    description='Launch stress tests in GKE',
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+argp.add_argument('--project_id',
+                  required=True,
+                  help='The Google Cloud Platform Project Id')
+argp.add_argument('--num_clients',
+                  default=1,
+                  type=int,
+                  help='Number of client instances to start')
+argp.add_argument('--docker_image_name',
+                  default=_DEFAULT_DOCKER_IMAGE_NAME,
+                  help='The name of the docker image containing stress client '
+                  'and stress servers')
+argp.add_argument('--build_docker_image',
+                  dest='build_docker_image',
+                  action='store_true',
+                  help='Build a docker image and push to Google Container '
+                  'Registry')
+argp.add_argument('--do_not_build_docker_image',
+                  dest='build_docker_image',
+                  action='store_false',
+                  help='Do not build and push docker image to Google Container '
+                  'Registry')
+argp.set_defaults(build_docker_image=True)
+
+argp.add_argument('--test_poll_interval_secs',
+                  default=_DEFAULT_TEST_POLL_INTERVAL_SECS,
+                  type=int,
+                  help='How frequently this script should monitor the health '
+                  'of stress clients and servers running in the GKE cluster')
+argp.add_argument('--test_duration_secs',
+                  default=_DEFAULT_TEST_DURATION_SECS,
+                  type=int,
+                  help='How long this test should be run')
+argp.add_argument('--kubernetes_proxy_port',
+                  default=_DEFAULT_KUBERNETES_PROXY_PORT,
+                  type=int,
+                  help='The port on which the kubernetes proxy (on localhost)'
+                  ' is started')
+argp.add_argument('--stress_server_port',
+                  default=_DEFAULT_STRESS_SERVER_PORT,
+                  type=int,
+                  help='The port on which the stress server (in GKE '
+                  'containers) listens')
+argp.add_argument('--stress_client_metrics_port',
+                  default=_DEFAULT_METRICS_PORT,
+                  type=int,
+                  help='The port on which the stress clients (in GKE '
+                  'containers) expose metrics')
+argp.add_argument('--stress_client_poll_interval_secs',
+                  default=_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS,
+                  type=int,
+                  help='How frequently the stress client wrapper script '
+                  'running inside GKE should monitor the health of the actual '
+                  'stress client process and upload the metrics to BigQuery')
+argp.add_argument('--stress_client_metrics_collection_interval_secs',
+                  default=_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS,
+                  type=int,
+                  help='How frequently should metrics be collected in-memory on'
+                  ' the stress clients (running inside GKE containers). Note '
+                  'that this is NOT the same as the upload-to-BigQuery '
+                  'frequency. The metrics upload frequency is controlled by the'
+                  ' --stress_client_poll_interval_secs flag')
+argp.add_argument('--stress_client_num_channels_per_server',
+                  default=_DEFAULT_NUM_CHANNELS_PER_SERVER,
+                  type=int,
+                  help='The number of channels created to each server from a '
+                  'stress client')
+argp.add_argument('--stress_client_num_stubs_per_channel',
+                  default=_DEFAULT_NUM_STUBS_PER_CHANNEL,
+                  type=int,
+                  help='The number of stubs created per channel. This number '
+                  'indicates the max number of RPCs that can be made in '
+                  'parallel on each channel at any given time')
+argp.add_argument('--stress_client_test_cases',
+                  default=_DEFAULT_TEST_CASES_STR,
+                  help='List of test cases (with weights) to be executed by the'
+                  ' stress test client. The list is in the following format:\n'
+                  ' <testcase_name:weight>,<testcase_name:weight>..\n'
+                  ' (Note: The weights do not have to add up to 100)')
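For reference, this is how the flags above resolve once parsed. A small sketch reusing the argp parser just defined (the flag values are illustrative):

args = argp.parse_args(['--project_id=my-gcp-project',  # placeholder project id
                        '--num_clients=10',
                        '--do_not_build_docker_image'])
print args.num_clients          # 10
print args.build_docker_image   # False (set by the store_false flag)
print args.test_duration_secs   # 7200, i.e. _DEFAULT_TEST_DURATION_SECS
print args.docker_image_name    # 'grpc_stress_test', i.e. _DEFAULT_DOCKER_IMAGE_NAME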
+
 if __name__ == '__main__':
-  image_name = 'grpc_stress_test'
-  gcp_project_id = 'sree-gce'
-  tag_name = 'gcr.io/%s/%s' % (gcp_project_id, image_name)
-  num_client_instances = 3
-  poll_interval_secs = 10
-  test_duration_secs = 150
-  run_test(True, gcp_project_id, image_name, tag_name, num_client_instances,
-           poll_interval_secs, test_duration_secs)
+  args = argp.parse_args()
+
+  test_settings = TestSettings(
+      args.build_docker_image, args.test_poll_interval_secs,
+      args.test_duration_secs, args.kubernetes_proxy_port)
+
+  gke_settings = GkeSettings(args.project_id, args.docker_image_name)
+
+  stress_server_settings = StressServerSettings(_SERVER_POD_NAME,
+                                                args.stress_server_port)
+  stress_client_settings = StressClientSettings(
+      args.num_clients, _CLIENT_POD_NAME_PREFIX, _SERVER_POD_NAME,
+      args.stress_server_port, args.stress_client_metrics_port,
+      args.stress_client_metrics_collection_interval_secs,
+      args.stress_client_poll_interval_secs,
+      args.stress_client_num_channels_per_server,
+      args.stress_client_num_stubs_per_channel, args.stress_client_test_cases)
+
+  run_test_main(test_settings, gke_settings, stress_server_settings,
+                stress_client_settings)
-- cgit v1.2.3


From da25fdb8826d59c30a851e4e936b09e450247301 Mon Sep 17 00:00:00 2001
From: Sree Kuchibhotla
Date: Fri, 26 Feb 2016 13:41:06 -0800
Subject: Address code review comments

---
 tools/gke/kubernetes_api.py                 |  4 ++--
 tools/gke/run_stress_tests_on_gke.py        | 18 ++++--------------
 tools/jenkins/build_interop_stress_image.sh |  3 +++
 tools/run_tests/stress_test/run_client.py   |  1 -
 4 files changed, 9 insertions(+), 17 deletions(-)

(limited to 'tools')

diff --git a/tools/gke/kubernetes_api.py b/tools/gke/kubernetes_api.py
index e1017e9da6..2d3f771e93 100755
--- a/tools/gke/kubernetes_api.py
+++ b/tools/gke/kubernetes_api.py
@@ -228,7 +228,7 @@ def delete_pod(kube_host, kube_port, namespace, pod_name):
 def create_pod_and_service(kube_host, kube_port, namespace, pod_name,
                            image_name, container_port_list, cmd_list, arg_list,
                            env_dict, is_headless_service):
-  """A simple helper function that creates a pod and a service (if pod creation was successful)."""
+  """A helper function that creates a pod and a service (if pod creation was successful)."""
   is_success = create_pod(kube_host, kube_port, namespace, pod_name, image_name,
                           container_port_list, cmd_list, arg_list, env_dict)
   if not is_success:
@@ -253,7 +253,7 @@ def create_pod_and_service(kube_host, kube_port, namespace, pod_name,

 def delete_pod_and_service(kube_host, kube_port, namespace, pod_name):
-  """ A simple helper function that calls delete_pod and delete_service """
+  """ A helper function that calls delete_pod and delete_service """
   is_success = delete_pod(kube_host, kube_port, namespace, pod_name)
   if not is_success:
     print 'Error in deleting pod %s' % pod_name
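As a usage sketch for the two helpers whose docstrings are touched above: with 'kubectl proxy --port=8001' already running locally, a caller creates and later tears down a pod/service pair like this (the image tag, port and environment are illustrative values, not taken from the patch):

import kubernetes_api

is_success = kubernetes_api.create_pod_and_service(
    'localhost', 8001, 'default',
    'stress-server',                           # pod name, reused as the service name
    'gcr.io/my-gcp-project/grpc_stress_test',  # placeholder image tag
    [8080],                                    # container (and service) port list
    ['/var/local/git/grpc/tools/run_tests/stress_test/run_server.py'],
    [],                                        # no command-line args
    {'STRESS_TEST_IMAGE_TYPE': 'SERVER'},      # env vars (trimmed for brevity)
    True)                                      # headless, so GKE creates DNS records
if is_success:
  kubernetes_api.delete_pod_and_service('localhost', 8001, 'default',
                                        'stress-server')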
diff --git a/tools/gke/run_stress_tests_on_gke.py b/tools/gke/run_stress_tests_on_gke.py
index cf0ef595a6..d126e3db43 100755
--- a/tools/gke/run_stress_tests_on_gke.py
+++ b/tools/gke/run_stress_tests_on_gke.py
@@ -178,28 +178,19 @@ class StressClientSettings:

 def _build_docker_image(image_name, tag_name):
-  """ Build the docker image and add a tag """
+  """ Build the docker image and tag it for the GKE registry """
   print 'Building docker image: %s' % image_name
   os.environ['INTEROP_IMAGE'] = image_name
+  os.environ['INTEROP_IMAGE_REPOSITORY_TAG'] = tag_name
   # Note that 'BASE_NAME' HAS to be 'grpc_interop_stress_cxx' since the script
   # build_interop_stress_image.sh invokes the following script:
   # tools/dockerfile/$BASE_NAME/build_interop_stress.sh
   os.environ['BASE_NAME'] = 'grpc_interop_stress_cxx'
   cmd = ['tools/jenkins/build_interop_stress_image.sh']
-  p = subprocess.Popen(args=cmd)
-  retcode = p.wait()
+  retcode = subprocess.call(args=cmd)
   if retcode != 0:
     print 'Error in building docker image'
     return False
-
-  print 'Adding an additional tag %s to the image %s' % (tag_name, image_name)
-  cmd = ['docker', 'tag', '-f', image_name, tag_name]
-  p = subprocess.Popen(args=cmd)
-  retcode = p.wait()
-  if retcode != 0:
-    print 'Error in creating the tag %s for %s' % (tag_name, image_name)
-    return False
-
   return True

@@ -207,8 +198,7 @@ def _push_docker_image_to_gke_registry(docker_tag_name):
   """Executes 'gcloud docker push <docker_tag_name>' to push the image to the GKE registry"""
   cmd = ['gcloud', 'docker', 'push', docker_tag_name]
   print 'Pushing %s to GKE registry..' % docker_tag_name
-  p = subprocess.Popen(args=cmd)
-  retcode = p.wait()
+  retcode = subprocess.call(args=cmd)
   if retcode != 0:
     print 'Error in pushing docker image %s to the GKE registry' % docker_tag_name
     return False

diff --git a/tools/jenkins/build_interop_stress_image.sh b/tools/jenkins/build_interop_stress_image.sh
index 92f2dab5e3..4c8e998a8a 100755
--- a/tools/jenkins/build_interop_stress_image.sh
+++ b/tools/jenkins/build_interop_stress_image.sh
@@ -35,6 +35,8 @@ set -x
 # Params:
 #  INTEROP_IMAGE - name of tag of the final interop image
+#  INTEROP_IMAGE_REPOSITORY_TAG - Optional. If set, the created image will be tagged using
+#    the command: 'docker tag $INTEROP_IMAGE $INTEROP_IMAGE_REPOSITORY_TAG'
 #  BASE_NAME - base name used to locate the base Dockerfile and build script
 #  TTY_FLAG - optional -t flag to make docker allocate tty
 #  BUILD_INTEROP_DOCKER_EXTRA_ARGS - optional args to be passed to the
@@ -77,6 +79,7 @@ CONTAINER_NAME="build_${BASE_NAME}_$(uuidgen)"
   $BASE_IMAGE \
   bash -l /var/local/jenkins/grpc/tools/dockerfile/$BASE_NAME/build_interop_stress.sh \
   && docker commit $CONTAINER_NAME $INTEROP_IMAGE \
+  && ( if [ -n "$INTEROP_IMAGE_REPOSITORY_TAG" ]; then docker tag $INTEROP_IMAGE $INTEROP_IMAGE_REPOSITORY_TAG ; fi ) \
   && echo "Successfully built image $INTEROP_IMAGE")
 EXITCODE=$?

diff --git a/tools/run_tests/stress_test/run_client.py b/tools/run_tests/stress_test/run_client.py
index 33958bce49..0fa1bf1cb9 100755
--- a/tools/run_tests/stress_test/run_client.py
+++ b/tools/run_tests/stress_test/run_client.py
@@ -142,7 +142,6 @@ def run_client():
     # Check if stress_client is still running. If so, collect metrics and upload
     # to BigQuery status table
     if stress_p.poll() is not None:
-      # TODO(sree) Upload completion status to BigQuery
       end_time = datetime.datetime.now().isoformat()
       event_type = EventType.SUCCESS
       details = 'End time: %s' % end_time
-- cgit v1.2.3


From aadb910524a62a816f334e1d5b1c74ee5d7f974c Mon Sep 17 00:00:00 2001
From: Sree Kuchibhotla
Date: Fri, 26 Feb 2016 16:47:47 -0800
Subject: Change directory structure for the scripts (remove tools/big_query and
 tools/gke directories and instead create tools/gcp).
Move scripts around in to the appropriate directories --- tools/big_query/big_query_utils.py | 140 ------ tools/gcp/stress_test/run_client.py | 187 +++++++ tools/gcp/stress_test/run_server.py | 120 +++++ tools/gcp/stress_test/stress_test_utils.py | 197 ++++++++ tools/gcp/utils/big_query_utils.py | 140 ++++++ tools/gcp/utils/kubernetes_api.py | 269 ++++++++++ tools/gke/kubernetes_api.py | 269 ---------- tools/gke/run_stress_tests_on_gke.py | 556 --------------------- tools/jenkins/build_interop_stress_image.sh | 2 +- tools/run_tests/stress_test/run_client.py | 187 ------- tools/run_tests/stress_test/run_server.py | 120 ----- .../stress_test/run_stress_tests_on_gke.py | 556 +++++++++++++++++++++ tools/run_tests/stress_test/stress_test_utils.py | 197 -------- 13 files changed, 1470 insertions(+), 1470 deletions(-) delete mode 100755 tools/big_query/big_query_utils.py create mode 100755 tools/gcp/stress_test/run_client.py create mode 100755 tools/gcp/stress_test/run_server.py create mode 100755 tools/gcp/stress_test/stress_test_utils.py create mode 100755 tools/gcp/utils/big_query_utils.py create mode 100755 tools/gcp/utils/kubernetes_api.py delete mode 100755 tools/gke/kubernetes_api.py delete mode 100755 tools/gke/run_stress_tests_on_gke.py delete mode 100755 tools/run_tests/stress_test/run_client.py delete mode 100755 tools/run_tests/stress_test/run_server.py create mode 100755 tools/run_tests/stress_test/run_stress_tests_on_gke.py delete mode 100755 tools/run_tests/stress_test/stress_test_utils.py (limited to 'tools') diff --git a/tools/big_query/big_query_utils.py b/tools/big_query/big_query_utils.py deleted file mode 100755 index e2379fd1aa..0000000000 --- a/tools/big_query/big_query_utils.py +++ /dev/null @@ -1,140 +0,0 @@ -#!/usr/bin/env python2.7 -# Copyright 2015-2016 Google Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import argparse -import json -import uuid -import httplib2 - -from apiclient import discovery -from apiclient.errors import HttpError -from oauth2client.client import GoogleCredentials - -NUM_RETRIES = 3 - - -def create_big_query(): - """Authenticates with cloud platform and gets a BiqQuery service object - """ - creds = GoogleCredentials.get_application_default() - return discovery.build('bigquery', 'v2', credentials=creds) - - -def create_dataset(biq_query, project_id, dataset_id): - is_success = True - body = { - 'datasetReference': { - 'projectId': project_id, - 'datasetId': dataset_id - } - } - - try: - dataset_req = biq_query.datasets().insert(projectId=project_id, body=body) - dataset_req.execute(num_retries=NUM_RETRIES) - except HttpError as http_error: - if http_error.resp.status == 409: - print 'Warning: The dataset %s already exists' % dataset_id - else: - # Note: For more debugging info, print "http_error.content" - print 'Error in creating dataset: %s. Err: %s' % (dataset_id, http_error) - is_success = False - return is_success - - -def create_table(big_query, project_id, dataset_id, table_id, table_schema, - description): - is_success = True - - body = { - 'description': description, - 'schema': { - 'fields': [{ - 'name': field_name, - 'type': field_type, - 'description': field_description - } for (field_name, field_type, field_description) in table_schema] - }, - 'tableReference': { - 'datasetId': dataset_id, - 'projectId': project_id, - 'tableId': table_id - } - } - - try: - table_req = big_query.tables().insert(projectId=project_id, - datasetId=dataset_id, - body=body) - res = table_req.execute(num_retries=NUM_RETRIES) - print 'Successfully created %s "%s"' % (res['kind'], res['id']) - except HttpError as http_error: - if http_error.resp.status == 409: - print 'Warning: Table %s already exists' % table_id - else: - print 'Error in creating table: %s. Err: %s' % (table_id, http_error) - is_success = False - return is_success - - -def insert_rows(big_query, project_id, dataset_id, table_id, rows_list): - is_success = True - body = {'rows': rows_list} - try: - insert_req = big_query.tabledata().insertAll(projectId=project_id, - datasetId=dataset_id, - tableId=table_id, - body=body) - print body - res = insert_req.execute(num_retries=NUM_RETRIES) - print res - except HttpError as http_error: - print 'Error in inserting rows in the table %s' % table_id - is_success = False - return is_success - - -def sync_query_job(big_query, project_id, query, timeout=5000): - query_data = {'query': query, 'timeoutMs': timeout} - query_job = None - try: - query_job = big_query.jobs().query( - projectId=project_id, - body=query_data).execute(num_retries=NUM_RETRIES) - except HttpError as http_error: - print 'Query execute job failed with error: %s' % http_error - print http_error.content - return query_job - - # List of (column name, column type, description) tuples -def make_row(unique_row_id, row_values_dict): - """row_values_dict is a dictionary of column name and column value. - """ - return {'insertId': unique_row_id, 'json': row_values_dict} diff --git a/tools/gcp/stress_test/run_client.py b/tools/gcp/stress_test/run_client.py new file mode 100755 index 0000000000..0fa1bf1cb9 --- /dev/null +++ b/tools/gcp/stress_test/run_client.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python2.7 +# Copyright 2015-2016, Google Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import datetime +import os +import re +import select +import subprocess +import sys +import time + +from stress_test_utils import EventType +from stress_test_utils import BigQueryHelper + + +# TODO (sree): Write a python grpc client to directly query the metrics instead +# of calling metrics_client +def _get_qps(metrics_cmd): + qps = 0 + try: + # Note: gpr_log() writes even non-error messages to stderr stream. So it is + # important that we set stderr=subprocess.STDOUT + p = subprocess.Popen(args=metrics_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + retcode = p.wait() + (out_str, err_str) = p.communicate() + if retcode != 0: + print 'Error in reading metrics information' + print 'Output: ', out_str + else: + # The overall qps is printed at the end of the line + m = re.search('\d+$', out_str) + qps = int(m.group()) if m else 0 + except Exception as ex: + print 'Exception while reading metrics information: ' + str(ex) + return qps + + +def run_client(): + """This is a wrapper around the stress test client and performs the following: + 1) Create the following two tables in Big Query: + (i) Summary table: To record events like the test started, completed + successfully or failed + (ii) Qps table: To periodically record the QPS sent by this client + 2) Start the stress test client and add a row in the Big Query summary + table + 3) Once every few seconds (as specificed by the poll_interval_secs) poll + the status of the stress test client process and perform the + following: + 3.1) If the process is still running, get the current qps by invoking + the metrics client program and add a row in the Big Query + Qps table. Sleep for a duration specified by poll_interval_secs + 3.2) If the process exited successfully, add a row in the Big Query + Summary table and exit + 3.3) If the process failed, add a row in Big Query summary table and + wait forever. + NOTE: This script typically runs inside a GKE pod which means + that the pod gets destroyed when the script exits. 
However, in + case the stress test client fails, we would not want the pod to + be destroyed (since we might want to connect to the pod for + examining logs). This is the reason why the script waits forever + in case of failures + """ + env = dict(os.environ) + image_type = env['STRESS_TEST_IMAGE_TYPE'] + image_name = env['STRESS_TEST_IMAGE'] + args_str = env['STRESS_TEST_ARGS_STR'] + metrics_client_image = env['METRICS_CLIENT_IMAGE'] + metrics_client_args_str = env['METRICS_CLIENT_ARGS_STR'] + run_id = env['RUN_ID'] + pod_name = env['POD_NAME'] + logfile_name = env.get('LOGFILE_NAME') + poll_interval_secs = float(env['POLL_INTERVAL_SECS']) + project_id = env['GCP_PROJECT_ID'] + dataset_id = env['DATASET_ID'] + summary_table_id = env['SUMMARY_TABLE_ID'] + qps_table_id = env['QPS_TABLE_ID'] + + bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id, + dataset_id, summary_table_id, qps_table_id) + bq_helper.initialize() + + # Create BigQuery Dataset and Tables: Summary Table and Metrics Table + if not bq_helper.setup_tables(): + print 'Error in creating BigQuery tables' + return + + start_time = datetime.datetime.now() + + logfile = None + details = 'Logging to stdout' + if logfile_name is not None: + print 'Opening logfile: %s ...' % logfile_name + details = 'Logfile: %s' % logfile_name + logfile = open(logfile_name, 'w') + + # Update status that the test is starting (in the status table) + bq_helper.insert_summary_row(EventType.STARTING, details) + + metrics_cmd = [metrics_client_image + ] + [x for x in metrics_client_args_str.split()] + stress_cmd = [image_name] + [x for x in args_str.split()] + + print 'Launching process %s ...' % stress_cmd + stress_p = subprocess.Popen(args=stress_cmd, + stdout=logfile, + stderr=subprocess.STDOUT) + + qps_history = [1, 1, 1] # Maintain the last 3 qps readings + qps_history_idx = 0 # Index into the qps_history list + + is_error = False + while True: + # Check if stress_client is still running. If so, collect metrics and upload + # to BigQuery status table + if stress_p.poll() is not None: + end_time = datetime.datetime.now().isoformat() + event_type = EventType.SUCCESS + details = 'End time: %s' % end_time + if stress_p.returncode != 0: + event_type = EventType.FAILURE + details = 'Return code = %d. End time: %s' % (stress_p.returncode, + end_time) + is_error = True + bq_helper.insert_summary_row(event_type, details) + print details + break + + # Stress client still running. Get metrics + qps = _get_qps(metrics_cmd) + qps_recorded_at = datetime.datetime.now().isoformat() + print 'qps: %d at %s' % (qps, qps_recorded_at) + + # If QPS has been zero for the last 3 iterations, flag it as error and exit + qps_history[qps_history_idx] = qps + qps_history_idx = (qps_history_idx + 1) % len(qps_history) + if sum(qps_history) == 0: + details = 'QPS has been zero for the last %d seconds - as of : %s' % ( + poll_interval_secs * 3, qps_recorded_at) + is_error = True + bq_helper.insert_summary_row(EventType.FAILURE, details) + print details + break + + # Upload qps metrics to BiqQuery + bq_helper.insert_qps_row(qps, qps_recorded_at) + + time.sleep(poll_interval_secs) + + if is_error: + print 'Waiting indefinitely..' 
+ select.select([], [], []) + + print 'Completed' + return + + +if __name__ == '__main__': + run_client() diff --git a/tools/gcp/stress_test/run_server.py b/tools/gcp/stress_test/run_server.py new file mode 100755 index 0000000000..64322f6100 --- /dev/null +++ b/tools/gcp/stress_test/run_server.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python2.7 +# Copyright 2015-2016, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import datetime +import os +import select +import subprocess +import sys +import time + +from stress_test_utils import BigQueryHelper +from stress_test_utils import EventType + + +def run_server(): + """This is a wrapper around the interop server and performs the following: + 1) Create a 'Summary table' in Big Query to record events like the server + started, completed successfully or failed. NOTE: This also creates + another table called the QPS table which is currently NOT needed on the + server (it is needed on the stress test clients) + 2) Start the server process and add a row in Big Query summary table + 3) Wait for the server process to terminate. The server process does not + terminate unless there is an error. + If the server process terminated with a failure, add a row in Big Query + and wait forever. + NOTE: This script typically runs inside a GKE pod which means that the + pod gets destroyed when the script exits. However, in case the server + process fails, we would not want the pod to be destroyed (since we + might want to connect to the pod for examining logs). This is the + reason why the script waits forever in case of failures. 
+ """ + + # Read the parameters from environment variables + env = dict(os.environ) + + run_id = env['RUN_ID'] # The unique run id for this test + image_type = env['STRESS_TEST_IMAGE_TYPE'] + image_name = env['STRESS_TEST_IMAGE'] + args_str = env['STRESS_TEST_ARGS_STR'] + pod_name = env['POD_NAME'] + project_id = env['GCP_PROJECT_ID'] + dataset_id = env['DATASET_ID'] + summary_table_id = env['SUMMARY_TABLE_ID'] + qps_table_id = env['QPS_TABLE_ID'] + + logfile_name = env.get('LOGFILE_NAME') + + print('pod_name: %s, project_id: %s, run_id: %s, dataset_id: %s, ' + 'summary_table_id: %s, qps_table_id: %s') % ( + pod_name, project_id, run_id, dataset_id, summary_table_id, + qps_table_id) + + bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id, + dataset_id, summary_table_id, qps_table_id) + bq_helper.initialize() + + # Create BigQuery Dataset and Tables: Summary Table and Metrics Table + if not bq_helper.setup_tables(): + print 'Error in creating BigQuery tables' + return + + start_time = datetime.datetime.now() + + logfile = None + details = 'Logging to stdout' + if logfile_name is not None: + print 'Opening log file: ', logfile_name + logfile = open(logfile_name, 'w') + details = 'Logfile: %s' % logfile_name + + # Update status that the test is starting (in the status table) + bq_helper.insert_summary_row(EventType.STARTING, details) + + stress_cmd = [image_name] + [x for x in args_str.split()] + + print 'Launching process %s ...' % stress_cmd + stress_p = subprocess.Popen(args=stress_cmd, + stdout=logfile, + stderr=subprocess.STDOUT) + + returncode = stress_p.wait() + if returncode != 0: + end_time = datetime.datetime.now().isoformat() + event_type = EventType.FAILURE + details = 'Returncode: %d; End time: %s' % (returncode, end_time) + bq_helper.insert_summary_row(event_type, details) + print 'Waiting indefinitely..' + select.select([], [], []) + return returncode + + +if __name__ == '__main__': + run_server() diff --git a/tools/gcp/stress_test/stress_test_utils.py b/tools/gcp/stress_test/stress_test_utils.py new file mode 100755 index 0000000000..c4b437e345 --- /dev/null +++ b/tools/gcp/stress_test/stress_test_utils.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python2.7 +# Copyright 2015-2016, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import datetime +import json +import os +import re +import select +import subprocess +import sys +import time + +# Import big_query_utils module +bq_utils_dir = os.path.abspath(os.path.join( + os.path.dirname(__file__), '../utils')) +sys.path.append(bq_utils_dir) +import big_query_utils as bq_utils + + +class EventType: + STARTING = 'STARTING' + SUCCESS = 'SUCCESS' + FAILURE = 'FAILURE' + + +class BigQueryHelper: + """Helper class for the stress test wrappers to interact with BigQuery. + """ + + def __init__(self, run_id, image_type, pod_name, project_id, dataset_id, + summary_table_id, qps_table_id): + self.run_id = run_id + self.image_type = image_type + self.pod_name = pod_name + self.project_id = project_id + self.dataset_id = dataset_id + self.summary_table_id = summary_table_id + self.qps_table_id = qps_table_id + + def initialize(self): + self.bq = bq_utils.create_big_query() + + def setup_tables(self): + return bq_utils.create_dataset(self.bq, self.project_id, self.dataset_id) \ + and self.__create_summary_table() \ + and self.__create_qps_table() + + def insert_summary_row(self, event_type, details): + row_values_dict = { + 'run_id': self.run_id, + 'image_type': self.image_type, + 'pod_name': self.pod_name, + 'event_date': datetime.datetime.now().isoformat(), + 'event_type': event_type, + 'details': details + } + # row_unique_id is something that uniquely identifies the row (BigQuery uses + # it for duplicate detection). + row_unique_id = '%s_%s_%s' % (self.run_id, self.pod_name, event_type) + row = bq_utils.make_row(row_unique_id, row_values_dict) + return bq_utils.insert_rows(self.bq, self.project_id, self.dataset_id, + self.summary_table_id, [row]) + + def insert_qps_row(self, qps, recorded_at): + row_values_dict = { + 'run_id': self.run_id, + 'pod_name': self.pod_name, + 'recorded_at': recorded_at, + 'qps': qps + } + + # row_unique_id is something that uniquely identifies the row (BigQuery uses + # it for duplicate detection). 
+ row_unique_id = '%s_%s_%s' % (self.run_id, self.pod_name, recorded_at) + row = bq_utils.make_row(row_unique_id, row_values_dict) + return bq_utils.insert_rows(self.bq, self.project_id, self.dataset_id, + self.qps_table_id, [row]) + + def check_if_any_tests_failed(self, num_query_retries=3): + query = ('SELECT event_type FROM %s.%s WHERE run_id = \'%s\' AND ' + 'event_type="%s"') % (self.dataset_id, self.summary_table_id, + self.run_id, EventType.FAILURE) + query_job = bq_utils.sync_query_job(self.bq, self.project_id, query) + page = self.bq.jobs().getQueryResults(**query_job['jobReference']).execute( + num_retries=num_query_retries) + num_failures = int(page['totalRows']) + print 'num rows: ', num_failures + return num_failures > 0 + + def print_summary_records(self, num_query_retries=3): + line = '-' * 120 + print line + print 'Summary records' + print 'Run Id: ', self.run_id + print 'Dataset Id: ', self.dataset_id + print line + query = ('SELECT pod_name, image_type, event_type, event_date, details' + ' FROM %s.%s WHERE run_id = \'%s\' ORDER by event_date;') % ( + self.dataset_id, self.summary_table_id, self.run_id) + query_job = bq_utils.sync_query_job(self.bq, self.project_id, query) + + print '{:<25} {:<12} {:<12} {:<30} {}'.format( + 'Pod name', 'Image type', 'Event type', 'Date', 'Details') + print line + page_token = None + while True: + page = self.bq.jobs().getQueryResults( + pageToken=page_token, + **query_job['jobReference']).execute(num_retries=num_query_retries) + rows = page.get('rows', []) + for row in rows: + print '{:<25} {:<12} {:<12} {:<30} {}'.format( + row['f'][0]['v'], row['f'][1]['v'], row['f'][2]['v'], + row['f'][3]['v'], row['f'][4]['v']) + page_token = page.get('pageToken') + if not page_token: + break + + def print_qps_records(self, num_query_retries=3): + line = '-' * 80 + print line + print 'QPS Summary' + print 'Run Id: ', self.run_id + print 'Dataset Id: ', self.dataset_id + print line + query = ( + 'SELECT pod_name, recorded_at, qps FROM %s.%s WHERE run_id = \'%s\' ' + 'ORDER by recorded_at;') % (self.dataset_id, self.qps_table_id, + self.run_id) + query_job = bq_utils.sync_query_job(self.bq, self.project_id, query) + print '{:<25} {:30} {}'.format('Pod name', 'Recorded at', 'Qps') + print line + page_token = None + while True: + page = self.bq.jobs().getQueryResults( + pageToken=page_token, + **query_job['jobReference']).execute(num_retries=num_query_retries) + rows = page.get('rows', []) + for row in rows: + print '{:<25} {:30} {}'.format(row['f'][0]['v'], row['f'][1]['v'], + row['f'][2]['v']) + page_token = page.get('pageToken') + if not page_token: + break + + def __create_summary_table(self): + summary_table_schema = [ + ('run_id', 'STRING', 'Test run id'), + ('image_type', 'STRING', 'Client or Server?'), + ('pod_name', 'STRING', 'GKE pod hosting this image'), + ('event_date', 'STRING', 'The date of this event'), + ('event_type', 'STRING', 'STARTED/SUCCESS/FAILURE'), + ('details', 'STRING', 'Any other relevant details') + ] + desc = ('The table that contains START/SUCCESS/FAILURE events for ' + ' the stress test clients and servers') + return bq_utils.create_table(self.bq, self.project_id, self.dataset_id, + self.summary_table_id, summary_table_schema, + desc) + + def __create_qps_table(self): + qps_table_schema = [ + ('run_id', 'STRING', 'Test run id'), + ('pod_name', 'STRING', 'GKE pod hosting this image'), + ('recorded_at', 'STRING', 'Metrics recorded at time'), + ('qps', 'INTEGER', 'Queries per second') + ] + desc = 'The table that cointains the 
diff --git a/tools/gcp/utils/big_query_utils.py b/tools/gcp/utils/big_query_utils.py
new file mode 100755
index 0000000000..e2379fd1aa
--- /dev/null
+++ b/tools/gcp/utils/big_query_utils.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python2.7
+# Copyright 2015-2016 Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+import json
+import uuid
+import httplib2
+
+from apiclient import discovery
+from apiclient.errors import HttpError
+from oauth2client.client import GoogleCredentials
+
+NUM_RETRIES = 3
+
+
+def create_big_query():
+ """Authenticates with cloud platform and gets a BigQuery service object."""
+ creds = GoogleCredentials.get_application_default()
+ return discovery.build('bigquery', 'v2', credentials=creds)
+
+
+def create_dataset(big_query, project_id, dataset_id):
+ is_success = True
+ body = {
+ 'datasetReference': {
+ 'projectId': project_id,
+ 'datasetId': dataset_id
+ }
+ }
+
+ try:
+ dataset_req = big_query.datasets().insert(projectId=project_id, body=body)
+ dataset_req.execute(num_retries=NUM_RETRIES)
+ except HttpError as http_error:
+ if http_error.resp.status == 409:
+ print 'Warning: The dataset %s already exists' % dataset_id
+ else:
+ # Note: For more debugging info, print "http_error.content"
+ print 'Error in creating dataset: %s. Err: %s' % (dataset_id, http_error)
+ is_success = False
+ return is_success
+
+
+def create_table(big_query, project_id, dataset_id, table_id, table_schema,
+ description):
+ # table_schema is a list of (column name, column type, description) tuples
+ is_success = True
+
+ body = {
+ 'description': description,
+ 'schema': {
+ 'fields': [{
+ 'name': field_name,
+ 'type': field_type,
+ 'description': field_description
+ } for (field_name, field_type, field_description) in table_schema]
+ },
+ 'tableReference': {
+ 'datasetId': dataset_id,
+ 'projectId': project_id,
+ 'tableId': table_id
+ }
+ }
+
+ try:
+ table_req = big_query.tables().insert(projectId=project_id,
+ datasetId=dataset_id,
+ body=body)
+ res = table_req.execute(num_retries=NUM_RETRIES)
+ print 'Successfully created %s "%s"' % (res['kind'], res['id'])
+ except HttpError as http_error:
+ if http_error.resp.status == 409:
+ print 'Warning: Table %s already exists' % table_id
+ else:
+ print 'Error in creating table: %s. Err: %s' % (table_id, http_error)
+ is_success = False
+ return is_success
+
+
+def insert_rows(big_query, project_id, dataset_id, table_id, rows_list):
+ is_success = True
+ body = {'rows': rows_list}
+ try:
+ insert_req = big_query.tabledata().insertAll(projectId=project_id,
+ datasetId=dataset_id,
+ tableId=table_id,
+ body=body)
+ print body
+ res = insert_req.execute(num_retries=NUM_RETRIES)
+ print res
+ except HttpError as http_error:
+ print 'Error in inserting rows in the table %s' % table_id
+ is_success = False
+ return is_success
+
+
+def sync_query_job(big_query, project_id, query, timeout=5000):
+ query_data = {'query': query, 'timeoutMs': timeout}
+ query_job = None
+ try:
+ query_job = big_query.jobs().query(
+ projectId=project_id,
+ body=query_data).execute(num_retries=NUM_RETRIES)
+ except HttpError as http_error:
+ print 'Query execute job failed with error: %s' % http_error
+ print http_error.content
+ return query_job
+
+
+def make_row(unique_row_id, row_values_dict):
+ """row_values_dict is a dictionary of column name and column value.
+ """
+ return {'insertId': unique_row_id, 'json': row_values_dict}
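+
+# A minimal usage sketch of the helpers above (the project, dataset and table
+# ids below are placeholders):
+#
+#   bq = create_big_query()
+#   create_dataset(bq, 'my-project', 'my_dataset')
+#   schema = [('id', 'INTEGER', 'Row id'), ('name', 'STRING', 'Row name')]
+#   create_table(bq, 'my-project', 'my_dataset', 'my_table', schema,
+#                'An example table')
+#   rows = [make_row(str(i), {'id': i, 'name': 'name_%d' % i})
+#           for i in range(3)]
+#   insert_rows(bq, 'my-project', 'my_dataset', 'my_table', rows)
+#   query_job = sync_query_job(bq, 'my-project',
+#                              'SELECT id, name FROM my_dataset.my_table;')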
diff --git a/tools/gcp/utils/kubernetes_api.py b/tools/gcp/utils/kubernetes_api.py
new file mode 100755
index 0000000000..2d3f771e93
--- /dev/null
+++ b/tools/gcp/utils/kubernetes_api.py
@@ -0,0 +1,269 @@
+#!/usr/bin/env python2.7
+# Copyright 2015-2016 Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import requests
+import json
+
+_REQUEST_TIMEOUT_SECS = 10
+
+
+def _make_pod_config(pod_name, image_name, container_port_list, cmd_list,
+ arg_list, env_dict):
+ """Creates a string containing the Pod definition as required by the Kubernetes API"""
+ body = {
+ 'kind': 'Pod',
+ 'apiVersion': 'v1',
+ 'metadata': {
+ 'name': pod_name,
+ 'labels': {'name': pod_name}
+ },
+ 'spec': {
+ 'containers': [
+ {
+ 'name': pod_name,
+ 'image': image_name,
+ 'ports': [{'containerPort': port,
+ 'protocol': 'TCP'}
+ for port in container_port_list],
+ 'imagePullPolicy': 'Always'
+ }
+ ]
+ }
+ }
+
+ env_list = [{'name': k, 'value': v} for (k, v) in env_dict.iteritems()]
+ if len(env_list) > 0:
+ body['spec']['containers'][0]['env'] = env_list
+
+ # Add the 'Command' and 'Args' attributes if they are passed.
+ # Note:
+ # - 'Command' overrides the ENTRYPOINT in the Docker image
+ # - 'Args' override the CMD in the Docker image (yes, it is confusing!)
+ if len(cmd_list) > 0:
+ body['spec']['containers'][0]['command'] = cmd_list
+ if len(arg_list) > 0:
+ body['spec']['containers'][0]['args'] = arg_list
+ return json.dumps(body)
+
+
+def _make_service_config(service_name, pod_name, service_port_list,
+ container_port_list, is_headless):
+ """Creates a string containing the Service definition as required by the Kubernetes API.
+
+ NOTE:
+ This creates either a Headless Service or a 'LoadBalancer' Service depending
+ on the is_headless parameter. For Headless Services, there is no 'type'
+ attribute and the 'clusterIP' attribute is set to 'None'. Also, if the
+ Service is Headless, Kubernetes creates DNS entries for the Pods - i.e. it
+ creates DNS A-records mapping the Service's name to the Pods' IPs
+ """
+ if len(container_port_list) != len(service_port_list):
+ print(
+ 'ERROR: container_port_list and service_port_list must be of same size')
+ return ''
+ body = {
+ 'kind': 'Service',
+ 'apiVersion': 'v1',
+ 'metadata': {
+ 'name': service_name,
+ 'labels': {
+ 'name': service_name
+ }
+ },
+ 'spec': {
+ 'ports': [],
+ 'selector': {
+ 'name': pod_name
+ }
+ }
+ }
+ # Populate the 'ports' list in the 'spec' section. This maps service ports
+ # (port numbers that are exposed by Kubernetes) to container ports (i.e. port
+ # numbers that are exposed by your Docker image)
+ for idx in range(len(container_port_list)):
+ port_entry = {
+ 'port': service_port_list[idx],
+ 'targetPort': container_port_list[idx],
+ 'protocol': 'TCP'
+ }
+ body['spec']['ports'].append(port_entry)
+
+ # Make this either a LoadBalancer service or a headless service depending on
+ # the is_headless parameter
+ if is_headless:
+ body['spec']['clusterIP'] = 'None'
+ else:
+ body['spec']['type'] = 'LoadBalancer'
+ return json.dumps(body)
+
+
+def _print_connection_error(msg):
+ print('ERROR: Connection failed. Did you remember to run the Kubernetes proxy'
+ ' on localhost (i.e. kubectl proxy --port=<proxy_port>)? Error: %s' % msg)
+
+
+def _do_post(post_url, api_name, request_body):
+ """Helper to do HTTP POST.
+
+ Note:
+ 1) On success, Kubernetes returns a success code of 201 (CREATED), not 200 (OK)
+ 2) A response code of 409 (CONFLICT) is interpreted as a success code (since
+ the error is most likely due to the resource already existing). This makes
+ _do_post() idempotent, which is semantically desirable.
+ """
+ is_success = True
+ try:
+ r = requests.post(post_url,
+ data=request_body,
+ timeout=_REQUEST_TIMEOUT_SECS)
+ if r.status_code == requests.codes.conflict:
+ print('WARN: Looks like the resource already exists. Api: %s, url: %s' %
+ (api_name, post_url))
+ elif r.status_code != requests.codes.created:
+ print('ERROR: %s API returned error. HTTP response: (%d) %s' %
+ (api_name, r.status_code, r.text))
+ is_success = False
+ except (requests.exceptions.Timeout,
+ requests.exceptions.ConnectionError) as e:
+ is_success = False
+ _print_connection_error(str(e))
+ return is_success
+
+
+def _do_delete(del_url, api_name):
+ """Helper to do HTTP DELETE.
+
+ Note: A response code of 404 (NOT_FOUND) is treated as success to keep
+ _do_delete() idempotent.
+ """
+ is_success = True
+ try:
+ r = requests.delete(del_url, timeout=_REQUEST_TIMEOUT_SECS)
+ if r.status_code == requests.codes.not_found:
+ print('WARN: The resource does not exist. Api: %s, url: %s' %
+ (api_name, del_url))
+ elif r.status_code != requests.codes.ok:
+ print('ERROR: %s API returned error. HTTP response: %s' %
+ (api_name, r.text))
+ is_success = False
+ except (requests.exceptions.Timeout,
+ requests.exceptions.ConnectionError) as e:
+ is_success = False
+ _print_connection_error(str(e))
+ return is_success
+
+
+def create_service(kube_host, kube_port, namespace, service_name, pod_name,
+ service_port_list, container_port_list, is_headless):
+ """Creates either a Headless Service or a LoadBalancer Service depending
+ on the is_headless parameter.
+ """
+ post_url = 'http://%s:%d/api/v1/namespaces/%s/services' % (
+ kube_host, kube_port, namespace)
+ request_body = _make_service_config(service_name, pod_name, service_port_list,
+ container_port_list, is_headless)
+ return _do_post(post_url, 'Create Service', request_body)
+
+
+def create_pod(kube_host, kube_port, namespace, pod_name, image_name,
+ container_port_list, cmd_list, arg_list, env_dict):
+ """Creates a Kubernetes Pod.
+
+ Note that it is generally NOT considered a good practice to directly create
+ Pods. Typically, the recommendation is to create 'Controllers' to create and
+ manage the Pods' lifecycle. Currently, Kubernetes only supports the
+ 'Replication Controller', which creates a configurable number of identical
+ replicas of Pods and automatically restarts any Pods in case of failures
+ (e.g. machine failures in Kubernetes). This makes it less flexible for our
+ test use cases, where we might want a slightly different set of args for each
+ Pod. Hence we directly create Pods here and do not worry much about
+ Kubernetes failures, since those are very rare.
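+
+ A sketch of a typical call (the image, port and env values below are
+ illustrative placeholders):
+
+ create_pod('localhost', 8001, 'default', 'stress-client-1',
+ 'gcr.io/my-project/grpc_stress_test', [8081],
+ ['/var/local/git/grpc/tools/run_tests/stress_test/run_client.py'],
+ [], {'RUN_ID': '2016_01_01_00_00_00'})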
+ """ + post_url = 'http://%s:%d/api/v1/namespaces/%s/pods' % (kube_host, kube_port, + namespace) + request_body = _make_pod_config(pod_name, image_name, container_port_list, + cmd_list, arg_list, env_dict) + return _do_post(post_url, 'Create Pod', request_body) + + +def delete_service(kube_host, kube_port, namespace, service_name): + del_url = 'http://%s:%d/api/v1/namespaces/%s/services/%s' % ( + kube_host, kube_port, namespace, service_name) + return _do_delete(del_url, 'Delete Service') + + +def delete_pod(kube_host, kube_port, namespace, pod_name): + del_url = 'http://%s:%d/api/v1/namespaces/%s/pods/%s' % (kube_host, kube_port, + namespace, pod_name) + return _do_delete(del_url, 'Delete Pod') + + +def create_pod_and_service(kube_host, kube_port, namespace, pod_name, + image_name, container_port_list, cmd_list, arg_list, + env_dict, is_headless_service): + """A helper function that creates a pod and a service (if pod creation was successful).""" + is_success = create_pod(kube_host, kube_port, namespace, pod_name, image_name, + container_port_list, cmd_list, arg_list, env_dict) + if not is_success: + print 'Error in creating Pod' + return False + + is_success = create_service( + kube_host, + kube_port, + namespace, + pod_name, # Use pod_name for service + pod_name, + container_port_list, # Service port list same as container port list + container_port_list, + is_headless_service) + if not is_success: + print 'Error in creating Service' + return False + + print 'Successfully created the pod/service %s' % pod_name + return True + + +def delete_pod_and_service(kube_host, kube_port, namespace, pod_name): + """ A helper function that calls delete_pod and delete_service """ + is_success = delete_pod(kube_host, kube_port, namespace, pod_name) + if not is_success: + print 'Error in deleting pod %s' % pod_name + return False + + # Note: service name assumed to the the same as pod name + is_success = delete_service(kube_host, kube_port, namespace, pod_name) + if not is_success: + print 'Error in deleting service %s' % pod_name + return False + + print 'Successfully deleted the Pod/Service: %s' % pod_name + return True diff --git a/tools/gke/kubernetes_api.py b/tools/gke/kubernetes_api.py deleted file mode 100755 index 2d3f771e93..0000000000 --- a/tools/gke/kubernetes_api.py +++ /dev/null @@ -1,269 +0,0 @@ -#!/usr/bin/env python2.7 -# Copyright 2015-2016 Google Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import requests -import json - -_REQUEST_TIMEOUT_SECS = 10 - - -def _make_pod_config(pod_name, image_name, container_port_list, cmd_list, - arg_list, env_dict): - """Creates a string containing the Pod defintion as required by the Kubernetes API""" - body = { - 'kind': 'Pod', - 'apiVersion': 'v1', - 'metadata': { - 'name': pod_name, - 'labels': {'name': pod_name} - }, - 'spec': { - 'containers': [ - { - 'name': pod_name, - 'image': image_name, - 'ports': [{'containerPort': port, - 'protocol': 'TCP'} - for port in container_port_list], - 'imagePullPolicy': 'Always' - } - ] - } - } - - env_list = [{'name': k, 'value': v} for (k, v) in env_dict.iteritems()] - if len(env_list) > 0: - body['spec']['containers'][0]['env'] = env_list - - # Add the 'Command' and 'Args' attributes if they are passed. - # Note: - # - 'Command' overrides the ENTRYPOINT in the Docker Image - # - 'Args' override the CMD in Docker image (yes, it is confusing!) - if len(cmd_list) > 0: - body['spec']['containers'][0]['command'] = cmd_list - if len(arg_list) > 0: - body['spec']['containers'][0]['args'] = arg_list - return json.dumps(body) - - -def _make_service_config(service_name, pod_name, service_port_list, - container_port_list, is_headless): - """Creates a string containing the Service definition as required by the Kubernetes API. - - NOTE: - This creates either a Headless Service or 'LoadBalancer' service depending on - the is_headless parameter. For Headless services, there is no 'type' attribute - and the 'clusterIP' attribute is set to 'None'. Also, if the service is - Headless, Kubernetes creates DNS entries for Pods - i.e creates DNS A-records - mapping the service's name to the Pods' IPs - """ - if len(container_port_list) != len(service_port_list): - print( - 'ERROR: container_port_list and service_port_list must be of same size') - return '' - body = { - 'kind': 'Service', - 'apiVersion': 'v1', - 'metadata': { - 'name': service_name, - 'labels': { - 'name': service_name - } - }, - 'spec': { - 'ports': [], - 'selector': { - 'name': pod_name - } - } - } - # Populate the 'ports' list in the 'spec' section. This maps service ports - # (port numbers that are exposed by Kubernetes) to container ports (i.e port - # numbers that are exposed by your Docker image) - for idx in range(len(container_port_list)): - port_entry = { - 'port': service_port_list[idx], - 'targetPort': container_port_list[idx], - 'protocol': 'TCP' - } - body['spec']['ports'].append(port_entry) - - # Make this either a LoadBalancer service or a headless service depending on - # the is_headless parameter - if is_headless: - body['spec']['clusterIP'] = 'None' - else: - body['spec']['type'] = 'LoadBalancer' - return json.dumps(body) - - -def _print_connection_error(msg): - print('ERROR: Connection failed. Did you remember to run Kubenetes proxy on ' - 'localhost (i.e kubectl proxy --port=) ?. Error: %s' % msg) - - -def _do_post(post_url, api_name, request_body): - """Helper to do HTTP POST. 
- - Note: - 1) On success, Kubernetes returns a success code of 201(CREATED) not 200(OK) - 2) A response code of 509(CONFLICT) is interpreted as a success code (since - the error is most likely due to the resource already existing). This makes - _do_post() idempotent which is semantically desirable. - """ - is_success = True - try: - r = requests.post(post_url, - data=request_body, - timeout=_REQUEST_TIMEOUT_SECS) - if r.status_code == requests.codes.conflict: - print('WARN: Looks like the resource already exists. Api: %s, url: %s' % - (api_name, post_url)) - elif r.status_code != requests.codes.created: - print('ERROR: %s API returned error. HTTP response: (%d) %s' % - (api_name, r.status_code, r.text)) - is_success = False - except (requests.exceptions.Timeout, - requests.exceptions.ConnectionError) as e: - is_success = False - _print_connection_error(str(e)) - return is_success - - -def _do_delete(del_url, api_name): - """Helper to do HTTP DELETE. - - Note: A response code of 404(NOT_FOUND) is treated as success to keep - _do_delete() idempotent. - """ - is_success = True - try: - r = requests.delete(del_url, timeout=_REQUEST_TIMEOUT_SECS) - if r.status_code == requests.codes.not_found: - print('WARN: The resource does not exist. Api: %s, url: %s' % - (api_name, del_url)) - elif r.status_code != requests.codes.ok: - print('ERROR: %s API returned error. HTTP response: %s' % - (api_name, r.text)) - is_success = False - except (requests.exceptions.Timeout, - requests.exceptions.ConnectionError) as e: - is_success = False - _print_connection_error(str(e)) - return is_success - - -def create_service(kube_host, kube_port, namespace, service_name, pod_name, - service_port_list, container_port_list, is_headless): - """Creates either a Headless Service or a LoadBalancer Service depending - on the is_headless parameter. - """ - post_url = 'http://%s:%d/api/v1/namespaces/%s/services' % ( - kube_host, kube_port, namespace) - request_body = _make_service_config(service_name, pod_name, service_port_list, - container_port_list, is_headless) - return _do_post(post_url, 'Create Service', request_body) - - -def create_pod(kube_host, kube_port, namespace, pod_name, image_name, - container_port_list, cmd_list, arg_list, env_dict): - """Creates a Kubernetes Pod. - - Note that it is generally NOT considered a good practice to directly create - Pods. Typically, the recommendation is to create 'Controllers' to create and - manage Pods' lifecycle. Currently Kubernetes only supports 'Replication - Controller' which creates a configurable number of 'identical Replicas' of - Pods and automatically restarts any Pods in case of failures (for eg: Machine - failures in Kubernetes). This makes it less flexible for our test use cases - where we might want slightly different set of args to each Pod. Hence we - directly create Pods and not care much about Kubernetes failures since those - are very rare. 
- """ - post_url = 'http://%s:%d/api/v1/namespaces/%s/pods' % (kube_host, kube_port, - namespace) - request_body = _make_pod_config(pod_name, image_name, container_port_list, - cmd_list, arg_list, env_dict) - return _do_post(post_url, 'Create Pod', request_body) - - -def delete_service(kube_host, kube_port, namespace, service_name): - del_url = 'http://%s:%d/api/v1/namespaces/%s/services/%s' % ( - kube_host, kube_port, namespace, service_name) - return _do_delete(del_url, 'Delete Service') - - -def delete_pod(kube_host, kube_port, namespace, pod_name): - del_url = 'http://%s:%d/api/v1/namespaces/%s/pods/%s' % (kube_host, kube_port, - namespace, pod_name) - return _do_delete(del_url, 'Delete Pod') - - -def create_pod_and_service(kube_host, kube_port, namespace, pod_name, - image_name, container_port_list, cmd_list, arg_list, - env_dict, is_headless_service): - """A helper function that creates a pod and a service (if pod creation was successful).""" - is_success = create_pod(kube_host, kube_port, namespace, pod_name, image_name, - container_port_list, cmd_list, arg_list, env_dict) - if not is_success: - print 'Error in creating Pod' - return False - - is_success = create_service( - kube_host, - kube_port, - namespace, - pod_name, # Use pod_name for service - pod_name, - container_port_list, # Service port list same as container port list - container_port_list, - is_headless_service) - if not is_success: - print 'Error in creating Service' - return False - - print 'Successfully created the pod/service %s' % pod_name - return True - - -def delete_pod_and_service(kube_host, kube_port, namespace, pod_name): - """ A helper function that calls delete_pod and delete_service """ - is_success = delete_pod(kube_host, kube_port, namespace, pod_name) - if not is_success: - print 'Error in deleting pod %s' % pod_name - return False - - # Note: service name assumed to the the same as pod name - is_success = delete_service(kube_host, kube_port, namespace, pod_name) - if not is_success: - print 'Error in deleting service %s' % pod_name - return False - - print 'Successfully deleted the Pod/Service: %s' % pod_name - return True diff --git a/tools/gke/run_stress_tests_on_gke.py b/tools/gke/run_stress_tests_on_gke.py deleted file mode 100755 index d126e3db43..0000000000 --- a/tools/gke/run_stress_tests_on_gke.py +++ /dev/null @@ -1,556 +0,0 @@ -#!/usr/bin/env python2.7 -# Copyright 2015-2016, Google Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import argparse -import datetime -import os -import subprocess -import sys -import time - -stress_test_utils_dir = os.path.abspath(os.path.join( - os.path.dirname(__file__), '../run_tests/stress_test')) -sys.path.append(stress_test_utils_dir) -from stress_test_utils import BigQueryHelper - -import kubernetes_api - -_GRPC_ROOT = os.path.abspath(os.path.join( - os.path.dirname(sys.argv[0]), '../..')) -os.chdir(_GRPC_ROOT) - -# num of seconds to wait for the GKE image to start and warmup -_GKE_IMAGE_WARMUP_WAIT_SECS = 60 - -_SERVER_POD_NAME = 'stress-server' -_CLIENT_POD_NAME_PREFIX = 'stress-client' -_DATASET_ID_PREFIX = 'stress_test' -_SUMMARY_TABLE_ID = 'summary' -_QPS_TABLE_ID = 'qps' - -_DEFAULT_DOCKER_IMAGE_NAME = 'grpc_stress_test' - -# The default port on which the kubernetes proxy server is started on localhost -# (i.e kubectl proxy --port=) -_DEFAULT_KUBERNETES_PROXY_PORT = 8001 - -# How frequently should the stress client wrapper script (running inside a GKE -# container) poll the health of the stress client (also running inside the GKE -# container) and upload metrics to BigQuery -_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS = 60 - -# The default setting for stress test server and client -_DEFAULT_STRESS_SERVER_PORT = 8080 -_DEFAULT_METRICS_PORT = 8081 -_DEFAULT_TEST_CASES_STR = 'empty_unary:1,large_unary:1,client_streaming:1,server_streaming:1,empty_stream:1' -_DEFAULT_NUM_CHANNELS_PER_SERVER = 5 -_DEFAULT_NUM_STUBS_PER_CHANNEL = 10 -_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS = 30 - -# Number of stress client instances to launch -_DEFAULT_NUM_CLIENTS = 3 - -# How frequently should this test monitor the health of Stress clients and -# Servers running in GKE -_DEFAULT_TEST_POLL_INTERVAL_SECS = 60 - -# Default run time for this test (2 hour) -_DEFAULT_TEST_DURATION_SECS = 7200 - -# The number of seconds it would take a GKE pod to warm up (i.e get to 'Running' -# state from the time of creation). Ideally this is something the test should -# automatically determine by using Kubernetes API to poll the pods status. -_DEFAULT_GKE_WARMUP_SECS = 60 - - -class KubernetesProxy: - """ Class to start a proxy on localhost to the Kubernetes API server """ - - def __init__(self, api_port): - self.port = api_port - self.p = None - self.started = False - - def start(self): - cmd = ['kubectl', 'proxy', '--port=%d' % self.port] - self.p = subprocess.Popen(args=cmd) - self.started = True - time.sleep(2) - print '..Started' - - def get_port(self): - return self.port - - def is_started(self): - return self.started - - def __del__(self): - if self.p is not None: - print 'Shutting down Kubernetes proxy..' 
- self.p.kill() - - -class TestSettings: - - def __init__(self, build_docker_image, test_poll_interval_secs, - test_duration_secs, kubernetes_proxy_port): - self.build_docker_image = build_docker_image - self.test_poll_interval_secs = test_poll_interval_secs - self.test_duration_secs = test_duration_secs - self.kubernetes_proxy_port = kubernetes_proxy_port - - -class GkeSettings: - - def __init__(self, project_id, docker_image_name): - self.project_id = project_id - self.docker_image_name = docker_image_name - self.tag_name = 'gcr.io/%s/%s' % (project_id, docker_image_name) - - -class BigQuerySettings: - - def __init__(self, run_id, dataset_id, summary_table_id, qps_table_id): - self.run_id = run_id - self.dataset_id = dataset_id - self.summary_table_id = summary_table_id - self.qps_table_id = qps_table_id - - -class StressServerSettings: - - def __init__(self, server_pod_name, server_port): - self.server_pod_name = server_pod_name - self.server_port = server_port - - -class StressClientSettings: - - def __init__(self, num_clients, client_pod_name_prefix, server_pod_name, - server_port, metrics_port, metrics_collection_interval_secs, - stress_client_poll_interval_secs, num_channels_per_server, - num_stubs_per_channel, test_cases_str): - self.num_clients = num_clients - self.client_pod_name_prefix = client_pod_name_prefix - self.server_pod_name = server_pod_name - self.server_port = server_port - self.metrics_port = metrics_port - self.metrics_collection_interval_secs = metrics_collection_interval_secs - self.stress_client_poll_interval_secs = stress_client_poll_interval_secs - self.num_channels_per_server = num_channels_per_server - self.num_stubs_per_channel = num_stubs_per_channel - self.test_cases_str = test_cases_str - - # == Derived properties == - # Note: Client can accept a list of server addresses (a comma separated list - # of 'server_name:server_port'). In this case, we only have one server - # address to pass - self.server_addresses = '%s.default.svc.cluster.local:%d' % ( - server_pod_name, server_port) - self.client_pod_names_list = ['%s-%d' % (client_pod_name_prefix, i) - for i in range(1, num_clients + 1)] - - -def _build_docker_image(image_name, tag_name): - """ Build the docker image and add tag it to the GKE repository """ - print 'Building docker image: %s' % image_name - os.environ['INTEROP_IMAGE'] = image_name - os.environ['INTEROP_IMAGE_REPOSITORY_TAG'] = tag_name - # Note that 'BASE_NAME' HAS to be 'grpc_interop_stress_cxx' since the script - # build_interop_stress_image.sh invokes the following script: - # tools/dockerfile/$BASE_NAME/build_interop_stress.sh - os.environ['BASE_NAME'] = 'grpc_interop_stress_cxx' - cmd = ['tools/jenkins/build_interop_stress_image.sh'] - retcode = subprocess.call(args=cmd) - if retcode != 0: - print 'Error in building docker image' - return False - return True - - -def _push_docker_image_to_gke_registry(docker_tag_name): - """Executes 'gcloud docker push ' to push the image to GKE registry""" - cmd = ['gcloud', 'docker', 'push', docker_tag_name] - print 'Pushing %s to GKE registry..' 
% docker_tag_name - retcode = subprocess.call(args=cmd) - if retcode != 0: - print 'Error in pushing docker image %s to the GKE registry' % docker_tag_name - return False - return True - - -def _launch_server(gke_settings, stress_server_settings, bq_settings, - kubernetes_proxy): - """ Launches a stress test server instance in GKE cluster """ - if not kubernetes_proxy.is_started: - print 'Kubernetes proxy must be started before calling this function' - return False - - # This is the wrapper script that is run in the container. This script runs - # the actual stress test server - server_cmd_list = [ - '/var/local/git/grpc/tools/run_tests/stress_test/run_server.py' - ] - - # run_server.py does not take any args from the command line. The args are - # instead passed via environment variables (see server_env below) - server_arg_list = [] - - # The parameters to the script run_server.py are injected into the container - # via environment variables - server_env = { - 'STRESS_TEST_IMAGE_TYPE': 'SERVER', - 'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/interop_server', - 'STRESS_TEST_ARGS_STR': '--port=%s' % stress_server_settings.server_port, - 'RUN_ID': bq_settings.run_id, - 'POD_NAME': stress_server_settings.server_pod_name, - 'GCP_PROJECT_ID': gke_settings.project_id, - 'DATASET_ID': bq_settings.dataset_id, - 'SUMMARY_TABLE_ID': bq_settings.summary_table_id, - 'QPS_TABLE_ID': bq_settings.qps_table_id - } - - # Launch Server - is_success = kubernetes_api.create_pod_and_service( - 'localhost', - kubernetes_proxy.get_port(), - 'default', # Use 'default' namespace - stress_server_settings.server_pod_name, - gke_settings.tag_name, - [stress_server_settings.server_port], # Port that should be exposed - server_cmd_list, - server_arg_list, - server_env, - True # Headless = True for server. Since we want DNS records to be created by GKE - ) - - return is_success - - -def _launch_client(gke_settings, stress_server_settings, stress_client_settings, - bq_settings, kubernetes_proxy): - """ Launches a configurable number of stress test clients on GKE cluster """ - if not kubernetes_proxy.is_started: - print 'Kubernetes proxy must be started before calling this function' - return False - - stress_client_arg_list = [ - '--server_addresses=%s' % stress_client_settings.server_addresses, - '--test_cases=%s' % stress_client_settings.test_cases_str, - '--num_stubs_per_channel=%d' % - stress_client_settings.num_stubs_per_channel - ] - - # This is the wrapper script that is run in the container. This script runs - # the actual stress client - client_cmd_list = [ - '/var/local/git/grpc/tools/run_tests/stress_test/run_client.py' - ] - - # run_client.py takes no args. 
All args are passed as env variables (see - # client_env) - client_arg_list = [] - - metrics_server_address = 'localhost:%d' % stress_client_settings.metrics_port - metrics_client_arg_list = [ - '--metrics_server_address=%s' % metrics_server_address, - '--total_only=true' - ] - - # The parameters to the script run_client.py are injected into the container - # via environment variables - client_env = { - 'STRESS_TEST_IMAGE_TYPE': 'CLIENT', - 'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/stress_test', - 'STRESS_TEST_ARGS_STR': ' '.join(stress_client_arg_list), - 'METRICS_CLIENT_IMAGE': '/var/local/git/grpc/bins/opt/metrics_client', - 'METRICS_CLIENT_ARGS_STR': ' '.join(metrics_client_arg_list), - 'RUN_ID': bq_settings.run_id, - 'POLL_INTERVAL_SECS': - str(stress_client_settings.stress_client_poll_interval_secs), - 'GCP_PROJECT_ID': gke_settings.project_id, - 'DATASET_ID': bq_settings.dataset_id, - 'SUMMARY_TABLE_ID': bq_settings.summary_table_id, - 'QPS_TABLE_ID': bq_settings.qps_table_id - } - - for pod_name in stress_client_settings.client_pod_names_list: - client_env['POD_NAME'] = pod_name - is_success = kubernetes_api.create_pod_and_service( - 'localhost', # Since proxy is running on localhost - kubernetes_proxy.get_port(), - 'default', # default namespace - pod_name, - gke_settings.tag_name, - [stress_client_settings.metrics_port - ], # Client pods expose metrics port - client_cmd_list, - client_arg_list, - client_env, - False # Client is not a headless service - ) - if not is_success: - print 'Error in launching client %s' % pod_name - return False - - return True - - -def _launch_server_and_client(gke_settings, stress_server_settings, - stress_client_settings, bq_settings, - kubernetes_proxy_port): - # Start kubernetes proxy - print 'Kubernetes proxy' - kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port) - kubernetes_proxy.start() - - print 'Launching server..' - is_success = _launch_server(gke_settings, stress_server_settings, bq_settings, - kubernetes_proxy) - if not is_success: - print 'Error in launching server' - return False - - # Server takes a while to start. - # TODO(sree) Use Kubernetes API to query the status of the server instead of - # sleeping - print 'Waiting for %s seconds for the server to start...' % _GKE_IMAGE_WARMUP_WAIT_SECS - time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS) - - # Launch client - client_pod_name_prefix = 'stress-client' - is_success = _launch_client(gke_settings, stress_server_settings, - stress_client_settings, bq_settings, - kubernetes_proxy) - - if not is_success: - print 'Error in launching client(s)' - return False - - print 'Waiting for %s seconds for the client images to start...' 
% _GKE_IMAGE_WARMUP_WAIT_SECS - time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS) - return True - - -def _delete_server_and_client(stress_server_settings, stress_client_settings, - kubernetes_proxy_port): - kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port) - kubernetes_proxy.start() - - # Delete clients first - is_success = True - for pod_name in stress_client_settings.client_pod_names_list: - is_success = kubernetes_api.delete_pod_and_service( - 'localhost', kubernetes_proxy_port, 'default', pod_name) - if not is_success: - return False - - # Delete server - is_success = kubernetes_api.delete_pod_and_service( - 'localhost', kubernetes_proxy_port, 'default', - stress_server_settings.server_pod_name) - return is_success - - -def run_test_main(test_settings, gke_settings, stress_server_settings, - stress_client_clients): - is_success = True - - if test_settings.build_docker_image: - is_success = _build_docker_image(gke_settings.docker_image_name, - gke_settings.tag_name) - if not is_success: - return False - - is_success = _push_docker_image_to_gke_registry(gke_settings.tag_name) - if not is_success: - return False - - # Create a unique id for this run (Note: Using timestamp instead of UUID to - # make it easier to deduce the date/time of the run just by looking at the run - # run id. This is useful in debugging when looking at records in Biq query) - run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') - dataset_id = '%s_%s' % (_DATASET_ID_PREFIX, run_id) - - # Big Query settings (common for both Stress Server and Client) - bq_settings = BigQuerySettings(run_id, dataset_id, _SUMMARY_TABLE_ID, - _QPS_TABLE_ID) - - bq_helper = BigQueryHelper(run_id, '', '', args.project_id, dataset_id, - _SUMMARY_TABLE_ID, _QPS_TABLE_ID) - bq_helper.initialize() - - try: - is_success = _launch_server_and_client(gke_settings, stress_server_settings, - stress_client_settings, bq_settings, - test_settings.kubernetes_proxy_port) - if not is_success: - return False - - start_time = datetime.datetime.now() - end_time = start_time + datetime.timedelta( - seconds=test_settings.test_duration_secs) - print 'Running the test until %s' % end_time.isoformat() - - while True: - if datetime.datetime.now() > end_time: - print 'Test was run for %d seconds' % test_settings.test_duration_secs - break - - # Check if either stress server or clients have failed - if bq_helper.check_if_any_tests_failed(): - is_success = False - print 'Some tests failed.' - break - - # Things seem to be running fine. Wait until next poll time to check the - # status - print 'Sleeping for %d seconds..' % test_settings.test_poll_interval_secs - time.sleep(test_settings.test_poll_interval_secs) - - # Print BiqQuery tables - bq_helper.print_summary_records() - bq_helper.print_qps_records() - - finally: - # If is_success is False at this point, it means that the stress tests were - # started successfully but failed while running the tests. 
In this case we - # do should not delete the pods (since they contain all the failure - # information) - if is_success: - _delete_server_and_client(stress_server_settings, stress_client_settings, - test_settings.kubernetes_proxy_port) - - return is_success - - -argp = argparse.ArgumentParser( - description='Launch stress tests in GKE', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -argp.add_argument('--project_id', - required=True, - help='The Google Cloud Platform Project Id') -argp.add_argument('--num_clients', - default=1, - type=int, - help='Number of client instances to start') -argp.add_argument('--docker_image_name', - default=_DEFAULT_DOCKER_IMAGE_NAME, - help='The name of the docker image containing stress client ' - 'and stress servers') -argp.add_argument('--build_docker_image', - dest='build_docker_image', - action='store_true', - help='Build a docker image and push to Google Container ' - 'Registry') -argp.add_argument('--do_not_build_docker_image', - dest='build_docker_image', - action='store_false', - help='Do not build and push docker image to Google Container ' - 'Registry') -argp.set_defaults(build_docker_image=True) - -argp.add_argument('--test_poll_interval_secs', - default=_DEFAULT_TEST_POLL_INTERVAL_SECS, - type=int, - help='How frequently should this script should monitor the ' - 'health of stress clients and servers running in the GKE ' - 'cluster') -argp.add_argument('--test_duration_secs', - default=_DEFAULT_TEST_DURATION_SECS, - type=int, - help='How long should this test be run') -argp.add_argument('--kubernetes_proxy_port', - default=_DEFAULT_KUBERNETES_PROXY_PORT, - type=int, - help='The port on which the kubernetes proxy (on localhost)' - ' is started') -argp.add_argument('--stress_server_port', - default=_DEFAULT_STRESS_SERVER_PORT, - type=int, - help='The port on which the stress server (in GKE ' - 'containers) listens') -argp.add_argument('--stress_client_metrics_port', - default=_DEFAULT_METRICS_PORT, - type=int, - help='The port on which the stress clients (in GKE ' - 'containers) expose metrics') -argp.add_argument('--stress_client_poll_interval_secs', - default=_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS, - type=int, - help='How frequently should the stress client wrapper script' - ' running inside GKE should monitor health of the actual ' - ' stress client process and upload the metrics to BigQuery') -argp.add_argument('--stress_client_metrics_collection_interval_secs', - default=_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS, - type=int, - help='How frequently should metrics be collected in-memory on' - ' the stress clients (running inside GKE containers). Note ' - 'that this is NOT the same as the upload-to-BigQuery ' - 'frequency. The metrics upload frequency is controlled by the' - ' --stress_client_poll_interval_secs flag') -argp.add_argument('--stress_client_num_channels_per_server', - default=_DEFAULT_NUM_CHANNELS_PER_SERVER, - type=int, - help='The number of channels created to each server from a ' - 'stress client') -argp.add_argument('--stress_client_num_stubs_per_channel', - default=_DEFAULT_NUM_STUBS_PER_CHANNEL, - type=int, - help='The number of stubs created per channel. This number ' - 'indicates the max number of RPCs that can be made in ' - 'parallel on each channel at any given time') -argp.add_argument('--stress_client_test_cases', - default=_DEFAULT_TEST_CASES_STR, - help='List of test cases (with weights) to be executed by the' - ' stress test client. 
The list is in the following format:\n' - ' ..\n' - ' (Note: The weights do not have to add up to 100)') - -if __name__ == '__main__': - args = argp.parse_args() - - test_settings = TestSettings( - args.build_docker_image, args.test_poll_interval_secs, - args.test_duration_secs, args.kubernetes_proxy_port) - - gke_settings = GkeSettings(args.project_id, args.docker_image_name) - - stress_server_settings = StressServerSettings(_SERVER_POD_NAME, - args.stress_server_port) - stress_client_settings = StressClientSettings( - args.num_clients, _CLIENT_POD_NAME_PREFIX, _SERVER_POD_NAME, - args.stress_server_port, args.stress_client_metrics_port, - args.stress_client_metrics_collection_interval_secs, - args.stress_client_poll_interval_secs, - args.stress_client_num_channels_per_server, - args.stress_client_num_stubs_per_channel, args.stress_client_test_cases) - - run_test_main(test_settings, gke_settings, stress_server_settings, - stress_client_settings) diff --git a/tools/jenkins/build_interop_stress_image.sh b/tools/jenkins/build_interop_stress_image.sh index 4c8e998a8a..501dc5b7ca 100755 --- a/tools/jenkins/build_interop_stress_image.sh +++ b/tools/jenkins/build_interop_stress_image.sh @@ -79,7 +79,7 @@ CONTAINER_NAME="build_${BASE_NAME}_$(uuidgen)" $BASE_IMAGE \ bash -l /var/local/jenkins/grpc/tools/dockerfile/$BASE_NAME/build_interop_stress.sh \ && docker commit $CONTAINER_NAME $INTEROP_IMAGE \ - && ( if [ -n $INTEROP_IMAGE_REPOSITORY_TAG ]; then docker tag $INTEROP_IMAGE $INTEROP_IMAGE_REPOSITORY_TAG ; fi ) \ + && ( if [ -n "$INTEROP_IMAGE_REPOSITORY_TAG" ]; then docker tag -f $INTEROP_IMAGE $INTEROP_IMAGE_REPOSITORY_TAG ; fi ) \ && echo "Successfully built image $INTEROP_IMAGE") EXITCODE=$? diff --git a/tools/run_tests/stress_test/run_client.py b/tools/run_tests/stress_test/run_client.py deleted file mode 100755 index 0fa1bf1cb9..0000000000 --- a/tools/run_tests/stress_test/run_client.py +++ /dev/null @@ -1,187 +0,0 @@ -#!/usr/bin/env python2.7 -# Copyright 2015-2016, Google Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import datetime -import os -import re -import select -import subprocess -import sys -import time - -from stress_test_utils import EventType -from stress_test_utils import BigQueryHelper - - -# TODO (sree): Write a python grpc client to directly query the metrics instead -# of calling metrics_client -def _get_qps(metrics_cmd): - qps = 0 - try: - # Note: gpr_log() writes even non-error messages to stderr stream. So it is - # important that we set stderr=subprocess.STDOUT - p = subprocess.Popen(args=metrics_cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - retcode = p.wait() - (out_str, err_str) = p.communicate() - if retcode != 0: - print 'Error in reading metrics information' - print 'Output: ', out_str - else: - # The overall qps is printed at the end of the line - m = re.search('\d+$', out_str) - qps = int(m.group()) if m else 0 - except Exception as ex: - print 'Exception while reading metrics information: ' + str(ex) - return qps - - -def run_client(): - """This is a wrapper around the stress test client and performs the following: - 1) Create the following two tables in Big Query: - (i) Summary table: To record events like the test started, completed - successfully or failed - (ii) Qps table: To periodically record the QPS sent by this client - 2) Start the stress test client and add a row in the Big Query summary - table - 3) Once every few seconds (as specificed by the poll_interval_secs) poll - the status of the stress test client process and perform the - following: - 3.1) If the process is still running, get the current qps by invoking - the metrics client program and add a row in the Big Query - Qps table. Sleep for a duration specified by poll_interval_secs - 3.2) If the process exited successfully, add a row in the Big Query - Summary table and exit - 3.3) If the process failed, add a row in Big Query summary table and - wait forever. - NOTE: This script typically runs inside a GKE pod which means - that the pod gets destroyed when the script exits. However, in - case the stress test client fails, we would not want the pod to - be destroyed (since we might want to connect to the pod for - examining logs). This is the reason why the script waits forever - in case of failures - """ - env = dict(os.environ) - image_type = env['STRESS_TEST_IMAGE_TYPE'] - image_name = env['STRESS_TEST_IMAGE'] - args_str = env['STRESS_TEST_ARGS_STR'] - metrics_client_image = env['METRICS_CLIENT_IMAGE'] - metrics_client_args_str = env['METRICS_CLIENT_ARGS_STR'] - run_id = env['RUN_ID'] - pod_name = env['POD_NAME'] - logfile_name = env.get('LOGFILE_NAME') - poll_interval_secs = float(env['POLL_INTERVAL_SECS']) - project_id = env['GCP_PROJECT_ID'] - dataset_id = env['DATASET_ID'] - summary_table_id = env['SUMMARY_TABLE_ID'] - qps_table_id = env['QPS_TABLE_ID'] - - bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id, - dataset_id, summary_table_id, qps_table_id) - bq_helper.initialize() - - # Create BigQuery Dataset and Tables: Summary Table and Metrics Table - if not bq_helper.setup_tables(): - print 'Error in creating BigQuery tables' - return - - start_time = datetime.datetime.now() - - logfile = None - details = 'Logging to stdout' - if logfile_name is not None: - print 'Opening logfile: %s ...' 
% logfile_name - details = 'Logfile: %s' % logfile_name - logfile = open(logfile_name, 'w') - - # Update status that the test is starting (in the status table) - bq_helper.insert_summary_row(EventType.STARTING, details) - - metrics_cmd = [metrics_client_image - ] + [x for x in metrics_client_args_str.split()] - stress_cmd = [image_name] + [x for x in args_str.split()] - - print 'Launching process %s ...' % stress_cmd - stress_p = subprocess.Popen(args=stress_cmd, - stdout=logfile, - stderr=subprocess.STDOUT) - - qps_history = [1, 1, 1] # Maintain the last 3 qps readings - qps_history_idx = 0 # Index into the qps_history list - - is_error = False - while True: - # Check if stress_client is still running. If so, collect metrics and upload - # to BigQuery status table - if stress_p.poll() is not None: - end_time = datetime.datetime.now().isoformat() - event_type = EventType.SUCCESS - details = 'End time: %s' % end_time - if stress_p.returncode != 0: - event_type = EventType.FAILURE - details = 'Return code = %d. End time: %s' % (stress_p.returncode, - end_time) - is_error = True - bq_helper.insert_summary_row(event_type, details) - print details - break - - # Stress client still running. Get metrics - qps = _get_qps(metrics_cmd) - qps_recorded_at = datetime.datetime.now().isoformat() - print 'qps: %d at %s' % (qps, qps_recorded_at) - - # If QPS has been zero for the last 3 iterations, flag it as error and exit - qps_history[qps_history_idx] = qps - qps_history_idx = (qps_history_idx + 1) % len(qps_history) - if sum(qps_history) == 0: - details = 'QPS has been zero for the last %d seconds - as of : %s' % ( - poll_interval_secs * 3, qps_recorded_at) - is_error = True - bq_helper.insert_summary_row(EventType.FAILURE, details) - print details - break - - # Upload qps metrics to BiqQuery - bq_helper.insert_qps_row(qps, qps_recorded_at) - - time.sleep(poll_interval_secs) - - if is_error: - print 'Waiting indefinitely..' - select.select([], [], []) - - print 'Completed' - return - - -if __name__ == '__main__': - run_client() diff --git a/tools/run_tests/stress_test/run_server.py b/tools/run_tests/stress_test/run_server.py deleted file mode 100755 index 64322f6100..0000000000 --- a/tools/run_tests/stress_test/run_server.py +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/bin/env python2.7 -# Copyright 2015-2016, Google Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
diff --git a/tools/run_tests/stress_test/run_server.py b/tools/run_tests/stress_test/run_server.py
deleted file mode 100755
index 64322f6100..0000000000
--- a/tools/run_tests/stress_test/run_server.py
+++ /dev/null
@@ -1,120 +0,0 @@
-#!/usr/bin/env python2.7
-# Copyright 2015-2016, Google Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-#     * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following disclaimer
-# in the documentation and/or other materials provided with the
-# distribution.
-#     * Neither the name of Google Inc. nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import datetime
-import os
-import select
-import subprocess
-import sys
-import time
-
-from stress_test_utils import BigQueryHelper
-from stress_test_utils import EventType
-
-
-def run_server():
-  """This is a wrapper around the interop server and performs the following:
-      1) Create a 'Summary table' in Big Query to record events like the server
-         started, completed successfully or failed. NOTE: This also creates
-         another table called the QPS table which is currently NOT needed on the
-         server (it is needed on the stress test clients)
-      2) Start the server process and add a row in Big Query summary table
-      3) Wait for the server process to terminate. The server process does not
-         terminate unless there is an error.
-         If the server process terminated with a failure, add a row in Big Query
-         and wait forever.
-         NOTE: This script typically runs inside a GKE pod which means that the
-         pod gets destroyed when the script exits. However, in case the server
-         process fails, we would not want the pod to be destroyed (since we
-         might want to connect to the pod for examining logs). This is the
-         reason why the script waits forever in case of failures.
-  """
-
-  # Read the parameters from environment variables
-  env = dict(os.environ)
-
-  run_id = env['RUN_ID']  # The unique run id for this test
-  image_type = env['STRESS_TEST_IMAGE_TYPE']
-  image_name = env['STRESS_TEST_IMAGE']
-  args_str = env['STRESS_TEST_ARGS_STR']
-  pod_name = env['POD_NAME']
-  project_id = env['GCP_PROJECT_ID']
-  dataset_id = env['DATASET_ID']
-  summary_table_id = env['SUMMARY_TABLE_ID']
-  qps_table_id = env['QPS_TABLE_ID']
-
-  logfile_name = env.get('LOGFILE_NAME')
-
-  print('pod_name: %s, project_id: %s, run_id: %s, dataset_id: %s, '
-        'summary_table_id: %s, qps_table_id: %s') % (
-            pod_name, project_id, run_id, dataset_id, summary_table_id,
-            qps_table_id)
-
-  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
-                             dataset_id, summary_table_id, qps_table_id)
-  bq_helper.initialize()
-
-  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
-  if not bq_helper.setup_tables():
-    print 'Error in creating BigQuery tables'
-    return
-
-  start_time = datetime.datetime.now()
-
-  logfile = None
-  details = 'Logging to stdout'
-  if logfile_name is not None:
-    print 'Opening log file: ', logfile_name
-    logfile = open(logfile_name, 'w')
-    details = 'Logfile: %s' % logfile_name
-
-  # Update status that the test is starting (in the status table)
-  bq_helper.insert_summary_row(EventType.STARTING, details)
-
-  stress_cmd = [image_name] + [x for x in args_str.split()]
-
-  print 'Launching process %s ...' % stress_cmd
-  stress_p = subprocess.Popen(args=stress_cmd,
-                              stdout=logfile,
-                              stderr=subprocess.STDOUT)
-
-  returncode = stress_p.wait()
-  if returncode != 0:
-    end_time = datetime.datetime.now().isoformat()
-    event_type = EventType.FAILURE
-    details = 'Returncode: %d; End time: %s' % (returncode, end_time)
-    bq_helper.insert_summary_row(event_type, details)
-    print 'Waiting indefinitely..'
-    select.select([], [], [])
-  return returncode
-
-
-if __name__ == '__main__':
-  run_server()
diff --git a/tools/run_tests/stress_test/run_stress_tests_on_gke.py b/tools/run_tests/stress_test/run_stress_tests_on_gke.py
new file mode 100755
index 0000000000..634eb1aca5
--- /dev/null
+++ b/tools/run_tests/stress_test/run_stress_tests_on_gke.py
@@ -0,0 +1,556 @@
+#!/usr/bin/env python2.7
+# Copyright 2015-2016, Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+import datetime
+import os
+import subprocess
+import sys
+import time
+
+stress_test_utils_dir = os.path.abspath(os.path.join(
+    os.path.dirname(__file__), '../../gcp/stress_test'))
+sys.path.append(stress_test_utils_dir)
+from stress_test_utils import BigQueryHelper
+
+kubernetes_api_dir = os.path.abspath(os.path.join(
+    os.path.dirname(__file__), '../../gcp/utils'))
+sys.path.append(kubernetes_api_dir)
+
+import kubernetes_api
+
+_GRPC_ROOT = os.path.abspath(os.path.join(
+    os.path.dirname(sys.argv[0]), '../../..'))
+os.chdir(_GRPC_ROOT)
+
+# num of seconds to wait for the GKE image to start and warmup
+_GKE_IMAGE_WARMUP_WAIT_SECS = 60
+
+_SERVER_POD_NAME = 'stress-server'
+_CLIENT_POD_NAME_PREFIX = 'stress-client'
+_DATASET_ID_PREFIX = 'stress_test'
+_SUMMARY_TABLE_ID = 'summary'
+_QPS_TABLE_ID = 'qps'
+
+_DEFAULT_DOCKER_IMAGE_NAME = 'grpc_stress_test'
+
+# The default port on which the kubernetes proxy server is started on localhost
+# (i.e kubectl proxy --port=)
+_DEFAULT_KUBERNETES_PROXY_PORT = 8001
+
+# How frequently should the stress client wrapper script (running inside a GKE
+# container) poll the health of the stress client (also running inside the GKE
+# container) and upload metrics to BigQuery
+_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS = 60
+
+# The default setting for stress test server and client
+_DEFAULT_STRESS_SERVER_PORT = 8080
+_DEFAULT_METRICS_PORT = 8081
+_DEFAULT_TEST_CASES_STR = 'empty_unary:1,large_unary:1,client_streaming:1,server_streaming:1,empty_stream:1'
+_DEFAULT_NUM_CHANNELS_PER_SERVER = 5
+_DEFAULT_NUM_STUBS_PER_CHANNEL = 10
+_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS = 30
+
+# Number of stress client instances to launch
+_DEFAULT_NUM_CLIENTS = 3
+
+# How frequently should this test monitor the health of Stress clients and
+# Servers running in GKE
+_DEFAULT_TEST_POLL_INTERVAL_SECS = 60
+
+# Default run time for this test (2 hours)
+_DEFAULT_TEST_DURATION_SECS = 7200
+
+# The number of seconds it would take a GKE pod to warm up (i.e get to 'Running'
+# state from the time of creation). Ideally this is something the test should
+# automatically determine by using Kubernetes API to poll the pods status.
+_DEFAULT_GKE_WARMUP_SECS = 60
+
+
+class KubernetesProxy:
+  """ Class to start a proxy on localhost to the Kubernetes API server """
+
+  def __init__(self, api_port):
+    self.port = api_port
+    self.p = None
+    self.started = False
+
+  def start(self):
+    cmd = ['kubectl', 'proxy', '--port=%d' % self.port]
+    self.p = subprocess.Popen(args=cmd)
+    self.started = True
+    time.sleep(2)
+    print '..Started'
+
+  def get_port(self):
+    return self.port
+
+  def is_started(self):
+    return self.started
+
+  def __del__(self):
+    if self.p is not None:
+      print 'Shutting down Kubernetes proxy..'
+      self.p.kill()
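KubernetesProxy above sleeps a fixed two seconds after spawning kubectl proxy and hopes the proxy is up. A sturdier start() would poll the port until it accepts a connection; a sketch of that check (the timeout values are arbitrary assumptions, not taken from this patch):

    import socket
    import time

    def wait_for_port(port, host='localhost', timeout_secs=10.0):
      """Polls until something listens on host:port; returns True on success."""
      deadline = time.time() + timeout_secs
      while time.time() < deadline:
        try:
          sock = socket.create_connection((host, port), 1.0)
          sock.close()
          return True
        except socket.error:
          time.sleep(0.1)
      return False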
+
+
+class TestSettings:
+
+  def __init__(self, build_docker_image, test_poll_interval_secs,
+               test_duration_secs, kubernetes_proxy_port):
+    self.build_docker_image = build_docker_image
+    self.test_poll_interval_secs = test_poll_interval_secs
+    self.test_duration_secs = test_duration_secs
+    self.kubernetes_proxy_port = kubernetes_proxy_port
+
+
+class GkeSettings:
+
+  def __init__(self, project_id, docker_image_name):
+    self.project_id = project_id
+    self.docker_image_name = docker_image_name
+    self.tag_name = 'gcr.io/%s/%s' % (project_id, docker_image_name)
+
+
+class BigQuerySettings:
+
+  def __init__(self, run_id, dataset_id, summary_table_id, qps_table_id):
+    self.run_id = run_id
+    self.dataset_id = dataset_id
+    self.summary_table_id = summary_table_id
+    self.qps_table_id = qps_table_id
+
+
+class StressServerSettings:
+
+  def __init__(self, server_pod_name, server_port):
+    self.server_pod_name = server_pod_name
+    self.server_port = server_port
+
+
+class StressClientSettings:
+
+  def __init__(self, num_clients, client_pod_name_prefix, server_pod_name,
+               server_port, metrics_port, metrics_collection_interval_secs,
+               stress_client_poll_interval_secs, num_channels_per_server,
+               num_stubs_per_channel, test_cases_str):
+    self.num_clients = num_clients
+    self.client_pod_name_prefix = client_pod_name_prefix
+    self.server_pod_name = server_pod_name
+    self.server_port = server_port
+    self.metrics_port = metrics_port
+    self.metrics_collection_interval_secs = metrics_collection_interval_secs
+    self.stress_client_poll_interval_secs = stress_client_poll_interval_secs
+    self.num_channels_per_server = num_channels_per_server
+    self.num_stubs_per_channel = num_stubs_per_channel
+    self.test_cases_str = test_cases_str
+
+    # == Derived properties ==
+    # Note: Client can accept a list of server addresses (a comma separated list
+    # of 'server_name:server_port'). In this case, we only have one server
+    # address to pass
+    self.server_addresses = '%s.default.svc.cluster.local:%d' % (
+        server_pod_name, server_port)
+    self.client_pod_names_list = ['%s-%d' % (client_pod_name_prefix, i)
+                                  for i in range(1, num_clients + 1)]
+
+
+def _build_docker_image(image_name, tag_name):
+  """ Build the docker image and tag it for the GKE repository """
+  print 'Building docker image: %s' % image_name
+  os.environ['INTEROP_IMAGE'] = image_name
+  os.environ['INTEROP_IMAGE_REPOSITORY_TAG'] = tag_name
+  # Note that 'BASE_NAME' HAS to be 'grpc_interop_stress_cxx' since the script
+  # build_interop_stress_image.sh invokes the following script:
+  # tools/dockerfile/$BASE_NAME/build_interop_stress.sh
+  os.environ['BASE_NAME'] = 'grpc_interop_stress_cxx'
+  cmd = ['tools/jenkins/build_interop_stress_image.sh']
+  retcode = subprocess.call(args=cmd)
+  if retcode != 0:
+    print 'Error in building docker image'
+    return False
+  return True
+
+
+def _push_docker_image_to_gke_registry(docker_tag_name):
+  """Executes 'gcloud docker push ' to push the image to GKE registry"""
+  cmd = ['gcloud', 'docker', 'push', docker_tag_name]
+  print 'Pushing %s to GKE registry..' % docker_tag_name
+  retcode = subprocess.call(args=cmd)
+  if retcode != 0:
+    print 'Error in pushing docker image %s to the GKE registry' % docker_tag_name
+    return False
+  return True
+
+
+def _launch_server(gke_settings, stress_server_settings, bq_settings,
+                   kubernetes_proxy):
+  """ Launches a stress test server instance in GKE cluster """
+  if not kubernetes_proxy.is_started():
+    print 'Kubernetes proxy must be started before calling this function'
+    return False
+
+  # This is the wrapper script that is run in the container. This script runs
+  # the actual stress test server
+  server_cmd_list = ['/var/local/git/grpc/tools/gcp/stress_test/run_server.py']
+
+  # run_server.py does not take any args from the command line. The args are
+  # instead passed via environment variables (see server_env below)
+  server_arg_list = []
+
+  # The parameters to the script run_server.py are injected into the container
+  # via environment variables
+  server_env = {
+      'STRESS_TEST_IMAGE_TYPE': 'SERVER',
+      'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/interop_server',
+      'STRESS_TEST_ARGS_STR': '--port=%s' % stress_server_settings.server_port,
+      'RUN_ID': bq_settings.run_id,
+      'POD_NAME': stress_server_settings.server_pod_name,
+      'GCP_PROJECT_ID': gke_settings.project_id,
+      'DATASET_ID': bq_settings.dataset_id,
+      'SUMMARY_TABLE_ID': bq_settings.summary_table_id,
+      'QPS_TABLE_ID': bq_settings.qps_table_id
+  }
+
+  # Launch Server
+  is_success = kubernetes_api.create_pod_and_service(
+      'localhost',
+      kubernetes_proxy.get_port(),
+      'default',  # Use 'default' namespace
+      stress_server_settings.server_pod_name,
+      gke_settings.tag_name,
+      [stress_server_settings.server_port],  # Port that should be exposed
+      server_cmd_list,
+      server_arg_list,
+      server_env,
+      True  # Headless = True for server. Since we want DNS records to be created by GKE
+  )
+
+  return is_success
+
+
+def _launch_client(gke_settings, stress_server_settings, stress_client_settings,
+                   bq_settings, kubernetes_proxy):
+  """ Launches a configurable number of stress test clients on GKE cluster """
+  if not kubernetes_proxy.is_started():
+    print 'Kubernetes proxy must be started before calling this function'
+    return False
+
+  stress_client_arg_list = [
+      '--server_addresses=%s' % stress_client_settings.server_addresses,
+      '--test_cases=%s' % stress_client_settings.test_cases_str,
+      '--num_stubs_per_channel=%d' %
+      stress_client_settings.num_stubs_per_channel
+  ]
+
+  # This is the wrapper script that is run in the container. This script runs
+  # the actual stress client
+  client_cmd_list = ['/var/local/git/grpc/tools/gcp/stress_test/run_client.py']
+
+  # run_client.py takes no args. All args are passed as env variables (see
+  # client_env)
+  client_arg_list = []
+
+  metrics_server_address = 'localhost:%d' % stress_client_settings.metrics_port
+  metrics_client_arg_list = [
+      '--metrics_server_address=%s' % metrics_server_address,
+      '--total_only=true'
+  ]
+
+  # The parameters to the script run_client.py are injected into the container
+  # via environment variables
+  client_env = {
+      'STRESS_TEST_IMAGE_TYPE': 'CLIENT',
+      'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/stress_test',
+      'STRESS_TEST_ARGS_STR': ' '.join(stress_client_arg_list),
+      'METRICS_CLIENT_IMAGE': '/var/local/git/grpc/bins/opt/metrics_client',
+      'METRICS_CLIENT_ARGS_STR': ' '.join(metrics_client_arg_list),
+      'RUN_ID': bq_settings.run_id,
+      'POLL_INTERVAL_SECS':
+          str(stress_client_settings.stress_client_poll_interval_secs),
+      'GCP_PROJECT_ID': gke_settings.project_id,
+      'DATASET_ID': bq_settings.dataset_id,
+      'SUMMARY_TABLE_ID': bq_settings.summary_table_id,
+      'QPS_TABLE_ID': bq_settings.qps_table_id
+  }
+
+  for pod_name in stress_client_settings.client_pod_names_list:
+    client_env['POD_NAME'] = pod_name
+    is_success = kubernetes_api.create_pod_and_service(
+        'localhost',  # Since proxy is running on localhost
+        kubernetes_proxy.get_port(),
+        'default',  # default namespace
+        pod_name,
+        gke_settings.tag_name,
+        [stress_client_settings.metrics_port],  # Client pods expose metrics port
+        client_cmd_list,
+        client_arg_list,
+        client_env,
+        False  # Client is not a headless service
+    )
+    if not is_success:
+      print 'Error in launching client %s' % pod_name
+      return False
+
+  return True
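One subtlety in _launch_client above: a single client_env dict is reused across the pod loop with POD_NAME overwritten each iteration, which is safe only if kubernetes_api.create_pod_and_service consumes the mapping synchronously. Copying per pod sidesteps the aliasing question entirely; a sketch (make_pod_envs is a hypothetical helper, not part of this patch):

    def make_pod_envs(base_env, pod_names):
      """Yields (pod_name, env) pairs with an independent env dict per pod."""
      for pod_name in pod_names:
        pod_env = dict(base_env)  # shallow copy: pods never share the dict
        pod_env['POD_NAME'] = pod_name
        yield pod_name, pod_env

    for name, env in make_pod_envs({'RUN_ID': 'r1'}, ['stress-client-1',
                                                      'stress-client-2']):
      print name, env['POD_NAME']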
+
+
+def _launch_server_and_client(gke_settings, stress_server_settings,
+                              stress_client_settings, bq_settings,
+                              kubernetes_proxy_port):
+  # Start kubernetes proxy
+  print 'Kubernetes proxy'
+  kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port)
+  kubernetes_proxy.start()
+
+  print 'Launching server..'
+  is_success = _launch_server(gke_settings, stress_server_settings, bq_settings,
+                              kubernetes_proxy)
+  if not is_success:
+    print 'Error in launching server'
+    return False
+
+  # Server takes a while to start.
+  # TODO(sree) Use Kubernetes API to query the status of the server instead of
+  # sleeping
+  print 'Waiting for %s seconds for the server to start...' % _GKE_IMAGE_WARMUP_WAIT_SECS
+  time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS)
+
+  # Launch client
+  client_pod_name_prefix = 'stress-client'
+  is_success = _launch_client(gke_settings, stress_server_settings,
+                              stress_client_settings, bq_settings,
+                              kubernetes_proxy)
+
+  if not is_success:
+    print 'Error in launching client(s)'
+    return False
+
+  print 'Waiting for %s seconds for the client images to start...' % _GKE_IMAGE_WARMUP_WAIT_SECS
+  time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS)
+  return True
+
+
+def _delete_server_and_client(stress_server_settings, stress_client_settings,
+                              kubernetes_proxy_port):
+  kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port)
+  kubernetes_proxy.start()
+
+  # Delete clients first
+  is_success = True
+  for pod_name in stress_client_settings.client_pod_names_list:
+    is_success = kubernetes_api.delete_pod_and_service(
+        'localhost', kubernetes_proxy_port, 'default', pod_name)
+    if not is_success:
+      return False
+
+  # Delete server
+  is_success = kubernetes_api.delete_pod_and_service(
+      'localhost', kubernetes_proxy_port, 'default',
+      stress_server_settings.server_pod_name)
+  return is_success
+
+
+def run_test_main(test_settings, gke_settings, stress_server_settings,
+                  stress_client_settings):
+  is_success = True
+
+  if test_settings.build_docker_image:
+    is_success = _build_docker_image(gke_settings.docker_image_name,
+                                     gke_settings.tag_name)
+    if not is_success:
+      return False
+
+    is_success = _push_docker_image_to_gke_registry(gke_settings.tag_name)
+    if not is_success:
+      return False
+
+  # Create a unique id for this run (Note: Using timestamp instead of UUID to
+  # make it easier to deduce the date/time of the run just by looking at the
+  # run id. This is useful in debugging when looking at records in BigQuery)
+  run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
+  dataset_id = '%s_%s' % (_DATASET_ID_PREFIX, run_id)
+
+  # Big Query settings (common for both Stress Server and Client)
+  bq_settings = BigQuerySettings(run_id, dataset_id, _SUMMARY_TABLE_ID,
+                                 _QPS_TABLE_ID)
+
+  bq_helper = BigQueryHelper(run_id, '', '', args.project_id, dataset_id,
+                             _SUMMARY_TABLE_ID, _QPS_TABLE_ID)
+  bq_helper.initialize()
+
+  try:
+    is_success = _launch_server_and_client(gke_settings, stress_server_settings,
+                                           stress_client_settings, bq_settings,
+                                           test_settings.kubernetes_proxy_port)
+    if not is_success:
+      return False
+
+    start_time = datetime.datetime.now()
+    end_time = start_time + datetime.timedelta(
+        seconds=test_settings.test_duration_secs)
+    print 'Running the test until %s' % end_time.isoformat()
+
+    while True:
+      if datetime.datetime.now() > end_time:
+        print 'Test was run for %d seconds' % test_settings.test_duration_secs
+        break
+
+      # Check if either stress server or clients have failed
+      if bq_helper.check_if_any_tests_failed():
+        is_success = False
+        print 'Some tests failed.'
+        break
+
+      # Things seem to be running fine. Wait until next poll time to check the
+      # status
+      print 'Sleeping for %d seconds..' % test_settings.test_poll_interval_secs
+      time.sleep(test_settings.test_poll_interval_secs)
+
+    # Print BigQuery tables
+    bq_helper.print_summary_records()
+    bq_helper.print_qps_records()
+
+  finally:
+    # If is_success is False at this point, it means that the stress tests were
+    # started successfully but failed while running the tests. In this case we
+    # should not delete the pods (since they contain all the failure
+    # information)
+    if is_success:
+      _delete_server_and_client(stress_server_settings, stress_client_settings,
+                                test_settings.kubernetes_proxy_port)
+
+  return is_success
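run_test_main's monitoring loop above is a plain deadline-plus-poll pattern: run until end_time, checking for failures every poll interval. Reduced to its control flow (check_failed is a hypothetical stand-in for bq_helper.check_if_any_tests_failed):

    import datetime
    import time

    def poll_until_deadline(duration_secs, poll_interval_secs, check_failed):
      """Returns False on the first detected failure, True once the deadline passes."""
      deadline = datetime.datetime.now() + datetime.timedelta(seconds=duration_secs)
      while datetime.datetime.now() <= deadline:
        if check_failed():
          return False
        time.sleep(poll_interval_secs)
      return True

    # e.g. poll_until_deadline(7200, 60, bq_helper.check_if_any_tests_failed)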
+
+
+argp = argparse.ArgumentParser(
+    description='Launch stress tests in GKE',
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+argp.add_argument('--project_id',
+                  required=True,
+                  help='The Google Cloud Platform Project Id')
+argp.add_argument('--num_clients',
+                  default=1,
+                  type=int,
+                  help='Number of client instances to start')
+argp.add_argument('--docker_image_name',
+                  default=_DEFAULT_DOCKER_IMAGE_NAME,
+                  help='The name of the docker image containing stress client '
+                  'and stress servers')
+argp.add_argument('--build_docker_image',
+                  dest='build_docker_image',
+                  action='store_true',
+                  help='Build a docker image and push to Google Container '
+                  'Registry')
+argp.add_argument('--do_not_build_docker_image',
+                  dest='build_docker_image',
+                  action='store_false',
+                  help='Do not build and push docker image to Google Container '
+                  'Registry')
+argp.set_defaults(build_docker_image=True)
+
+argp.add_argument('--test_poll_interval_secs',
+                  default=_DEFAULT_TEST_POLL_INTERVAL_SECS,
+                  type=int,
+                  help='How frequently this script should monitor the '
+                  'health of stress clients and servers running in the GKE '
+                  'cluster')
+argp.add_argument('--test_duration_secs',
+                  default=_DEFAULT_TEST_DURATION_SECS,
+                  type=int,
+                  help='How long should this test be run')
+argp.add_argument('--kubernetes_proxy_port',
+                  default=_DEFAULT_KUBERNETES_PROXY_PORT,
+                  type=int,
+                  help='The port on which the kubernetes proxy (on localhost)'
+                  ' is started')
+argp.add_argument('--stress_server_port',
+                  default=_DEFAULT_STRESS_SERVER_PORT,
+                  type=int,
+                  help='The port on which the stress server (in GKE '
+                  'containers) listens')
+argp.add_argument('--stress_client_metrics_port',
+                  default=_DEFAULT_METRICS_PORT,
+                  type=int,
+                  help='The port on which the stress clients (in GKE '
+                  'containers) expose metrics')
+argp.add_argument('--stress_client_poll_interval_secs',
+                  default=_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS,
+                  type=int,
+                  help='How frequently the stress client wrapper script'
+                  ' running inside GKE should monitor the health of the actual'
+                  ' stress client process and upload the metrics to BigQuery')
+argp.add_argument('--stress_client_metrics_collection_interval_secs',
+                  default=_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS,
+                  type=int,
+                  help='How frequently should metrics be collected in-memory on'
+                  ' the stress clients (running inside GKE containers). Note '
+                  'that this is NOT the same as the upload-to-BigQuery '
+                  'frequency. The metrics upload frequency is controlled by the'
+                  ' --stress_client_poll_interval_secs flag')
+argp.add_argument('--stress_client_num_channels_per_server',
+                  default=_DEFAULT_NUM_CHANNELS_PER_SERVER,
+                  type=int,
+                  help='The number of channels created to each server from a '
+                  'stress client')
+argp.add_argument('--stress_client_num_stubs_per_channel',
+                  default=_DEFAULT_NUM_STUBS_PER_CHANNEL,
+                  type=int,
+                  help='The number of stubs created per channel. This number '
+                  'indicates the max number of RPCs that can be made in '
+                  'parallel on each channel at any given time')
+argp.add_argument('--stress_client_test_cases',
+                  default=_DEFAULT_TEST_CASES_STR,
+                  help='List of test cases (with weights) to be executed by the'
+                  ' stress test client. The list is in the following format:\n'
+                  '  ..\n'
+                  ' (Note: The weights do not have to add up to 100)')
+
+if __name__ == '__main__':
+  args = argp.parse_args()
+
+  test_settings = TestSettings(
+      args.build_docker_image, args.test_poll_interval_secs,
+      args.test_duration_secs, args.kubernetes_proxy_port)
+
+  gke_settings = GkeSettings(args.project_id, args.docker_image_name)
+
+  stress_server_settings = StressServerSettings(_SERVER_POD_NAME,
+                                                args.stress_server_port)
+  stress_client_settings = StressClientSettings(
+      args.num_clients, _CLIENT_POD_NAME_PREFIX, _SERVER_POD_NAME,
+      args.stress_server_port, args.stress_client_metrics_port,
+      args.stress_client_metrics_collection_interval_secs,
+      args.stress_client_poll_interval_secs,
+      args.stress_client_num_channels_per_server,
+      args.stress_client_num_stubs_per_channel, args.stress_client_test_cases)
+
+  run_test_main(test_settings, gke_settings, stress_server_settings,
+                stress_client_settings)
diff --git a/tools/run_tests/stress_test/stress_test_utils.py b/tools/run_tests/stress_test/stress_test_utils.py
deleted file mode 100755
index 7adc0068f9..0000000000
--- a/tools/run_tests/stress_test/stress_test_utils.py
+++ /dev/null
@@ -1,197 +0,0 @@
-#!/usr/bin/env python2.7
-# Copyright 2015-2016, Google Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-#     * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following disclaimer
-# in the documentation and/or other materials provided with the
-# distribution.
-#     * Neither the name of Google Inc. nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import datetime
-import json
-import os
-import re
-import select
-import subprocess
-import sys
-import time
-
-# Import big_query_utils module
-bq_utils_dir = os.path.abspath(os.path.join(
-    os.path.dirname(__file__), '../../big_query'))
-sys.path.append(bq_utils_dir)
-import big_query_utils as bq_utils
-
-
-class EventType:
-  STARTING = 'STARTING'
-  SUCCESS = 'SUCCESS'
-  FAILURE = 'FAILURE'
-
-
-class BigQueryHelper:
-  """Helper class for the stress test wrappers to interact with BigQuery.
- """ - - def __init__(self, run_id, image_type, pod_name, project_id, dataset_id, - summary_table_id, qps_table_id): - self.run_id = run_id - self.image_type = image_type - self.pod_name = pod_name - self.project_id = project_id - self.dataset_id = dataset_id - self.summary_table_id = summary_table_id - self.qps_table_id = qps_table_id - - def initialize(self): - self.bq = bq_utils.create_big_query() - - def setup_tables(self): - return bq_utils.create_dataset(self.bq, self.project_id, self.dataset_id) \ - and self.__create_summary_table() \ - and self.__create_qps_table() - - def insert_summary_row(self, event_type, details): - row_values_dict = { - 'run_id': self.run_id, - 'image_type': self.image_type, - 'pod_name': self.pod_name, - 'event_date': datetime.datetime.now().isoformat(), - 'event_type': event_type, - 'details': details - } - # row_unique_id is something that uniquely identifies the row (BigQuery uses - # it for duplicate detection). - row_unique_id = '%s_%s_%s' % (self.run_id, self.pod_name, event_type) - row = bq_utils.make_row(row_unique_id, row_values_dict) - return bq_utils.insert_rows(self.bq, self.project_id, self.dataset_id, - self.summary_table_id, [row]) - - def insert_qps_row(self, qps, recorded_at): - row_values_dict = { - 'run_id': self.run_id, - 'pod_name': self.pod_name, - 'recorded_at': recorded_at, - 'qps': qps - } - - # row_unique_id is something that uniquely identifies the row (BigQuery uses - # it for duplicate detection). - row_unique_id = '%s_%s_%s' % (self.run_id, self.pod_name, recorded_at) - row = bq_utils.make_row(row_unique_id, row_values_dict) - return bq_utils.insert_rows(self.bq, self.project_id, self.dataset_id, - self.qps_table_id, [row]) - - def check_if_any_tests_failed(self, num_query_retries=3): - query = ('SELECT event_type FROM %s.%s WHERE run_id = \'%s\' AND ' - 'event_type="%s"') % (self.dataset_id, self.summary_table_id, - self.run_id, EventType.FAILURE) - query_job = bq_utils.sync_query_job(self.bq, self.project_id, query) - page = self.bq.jobs().getQueryResults(**query_job['jobReference']).execute( - num_retries=num_query_retries) - num_failures = int(page['totalRows']) - print 'num rows: ', num_failures - return num_failures > 0 - - def print_summary_records(self, num_query_retries=3): - line = '-' * 120 - print line - print 'Summary records' - print 'Run Id: ', self.run_id - print 'Dataset Id: ', self.dataset_id - print line - query = ('SELECT pod_name, image_type, event_type, event_date, details' - ' FROM %s.%s WHERE run_id = \'%s\' ORDER by event_date;') % ( - self.dataset_id, self.summary_table_id, self.run_id) - query_job = bq_utils.sync_query_job(self.bq, self.project_id, query) - - print '{:<25} {:<12} {:<12} {:<30} {}'.format( - 'Pod name', 'Image type', 'Event type', 'Date', 'Details') - print line - page_token = None - while True: - page = self.bq.jobs().getQueryResults( - pageToken=page_token, - **query_job['jobReference']).execute(num_retries=num_query_retries) - rows = page.get('rows', []) - for row in rows: - print '{:<25} {:<12} {:<12} {:<30} {}'.format( - row['f'][0]['v'], row['f'][1]['v'], row['f'][2]['v'], - row['f'][3]['v'], row['f'][4]['v']) - page_token = page.get('pageToken') - if not page_token: - break - - def print_qps_records(self, num_query_retries=3): - line = '-' * 80 - print line - print 'QPS Summary' - print 'Run Id: ', self.run_id - print 'Dataset Id: ', self.dataset_id - print line - query = ( - 'SELECT pod_name, recorded_at, qps FROM %s.%s WHERE run_id = \'%s\' ' - 'ORDER by recorded_at;') % 
-    query_job = bq_utils.sync_query_job(self.bq, self.project_id, query)
-    print '{:<25} {:30} {}'.format('Pod name', 'Recorded at', 'Qps')
-    print line
-    page_token = None
-    while True:
-      page = self.bq.jobs().getQueryResults(
-          pageToken=page_token,
-          **query_job['jobReference']).execute(num_retries=num_query_retries)
-      rows = page.get('rows', [])
-      for row in rows:
-        print '{:<25} {:30} {}'.format(row['f'][0]['v'], row['f'][1]['v'],
-                                       row['f'][2]['v'])
-      page_token = page.get('pageToken')
-      if not page_token:
-        break
-
-  def __create_summary_table(self):
-    summary_table_schema = [
-        ('run_id', 'STRING', 'Test run id'),
-        ('image_type', 'STRING', 'Client or Server?'),
-        ('pod_name', 'STRING', 'GKE pod hosting this image'),
-        ('event_date', 'STRING', 'The date of this event'),
-        ('event_type', 'STRING', 'STARTED/SUCCESS/FAILURE'),
-        ('details', 'STRING', 'Any other relevant details')
-    ]
-    desc = ('The table that contains START/SUCCESS/FAILURE events for '
-            ' the stress test clients and servers')
-    return bq_utils.create_table(self.bq, self.project_id, self.dataset_id,
-                                 self.summary_table_id, summary_table_schema,
-                                 desc)
-
-  def __create_qps_table(self):
-    qps_table_schema = [
-        ('run_id', 'STRING', 'Test run id'),
-        ('pod_name', 'STRING', 'GKE pod hosting this image'),
-        ('recorded_at', 'STRING', 'Metrics recorded at time'),
-        ('qps', 'INTEGER', 'Queries per second')
-    ]
-    desc = 'The table that contains the qps recorded at various intervals'
-    return bq_utils.create_table(self.bq, self.project_id, self.dataset_id,
-                                 self.qps_table_id, qps_table_schema, desc)
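The row_unique_id built by insert_summary_row and insert_qps_row above feeds BigQuery's insertId-based best-effort deduplication: a retried insert carrying the same insertId is dropped server-side. The row payload that bq_utils.make_row presumably assembles has this shape (an inference from the insertAll API, not a copy of big_query_utils):

    def make_row(unique_row_id, row_values_dict):
      """One streaming-insert row; insertId drives duplicate detection."""
      return {'insertId': unique_row_id, 'json': row_values_dict}

    row = make_row('run1_pod1_FAILURE',
                   {'run_id': 'run1', 'event_type': 'FAILURE'})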
-- cgit v1.2.3

From fa4b163feac008bf7147cdad4f2dd88ef778ad2c Mon Sep 17 00:00:00 2001
From: Jan Tattermusch
Date: Fri, 26 Feb 2016 17:14:01 -0800
Subject: windows C# distribtest

---
 test/distrib/csharp/DistribTest.sln                |  6 ++++++
 test/distrib/csharp/DistribTest/DistribTest.csproj | 20 ++++++++++++++++++++
 test/distrib/csharp/run_distrib_test.bat           | 21 +++++++++++++++++++++
 test/distrib/csharp/run_distrib_test.sh            |  4 +---
 test/distrib/csharp/update_version.sh              | 10 +++++++++-
 tools/run_tests/distribtest_targets.py             | 11 ++++++++++-
 6 files changed, 67 insertions(+), 5 deletions(-)
 create mode 100644 test/distrib/csharp/run_distrib_test.bat
(limited to 'tools')

diff --git a/test/distrib/csharp/DistribTest.sln b/test/distrib/csharp/DistribTest.sln
index 0eca35c30f..78d5397ca9 100644
--- a/test/distrib/csharp/DistribTest.sln
+++ b/test/distrib/csharp/DistribTest.sln
@@ -8,13 +8,19 @@ EndProject
 Global
     GlobalSection(SolutionConfigurationPlatforms) = preSolution
         Debug|Any CPU = Debug|Any CPU
+        Debug|x64 = Debug|x64
         Release|Any CPU = Release|Any CPU
+        Release|x64 = Release|x64
     EndGlobalSection
     GlobalSection(ProjectConfigurationPlatforms) = postSolution
        {A3E61CC3-3710-49A3-A830-A0066EDBCE2F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {A3E61CC3-3710-49A3-A830-A0066EDBCE2F}.Debug|Any CPU.Build.0 = Debug|Any CPU
+       {A3E61CC3-3710-49A3-A830-A0066EDBCE2F}.Debug|x64.ActiveCfg = Debug|x64
+       {A3E61CC3-3710-49A3-A830-A0066EDBCE2F}.Debug|x64.Build.0 = Debug|x64
        {A3E61CC3-3710-49A3-A830-A0066EDBCE2F}.Release|Any CPU.ActiveCfg = Release|Any CPU
        {A3E61CC3-3710-49A3-A830-A0066EDBCE2F}.Release|Any CPU.Build.0 = Release|Any CPU
+       {A3E61CC3-3710-49A3-A830-A0066EDBCE2F}.Release|x64.ActiveCfg = Release|x64
+       {A3E61CC3-3710-49A3-A830-A0066EDBCE2F}.Release|x64.Build.0 = Release|x64
     EndGlobalSection
     GlobalSection(SolutionProperties) = preSolution
        HideSolutionNode = FALSE
diff --git a/test/distrib/csharp/DistribTest/DistribTest.csproj b/test/distrib/csharp/DistribTest/DistribTest.csproj
index 124fc1bdf0..7605495f0f 100644
--- a/test/distrib/csharp/DistribTest/DistribTest.csproj
+++ b/test/distrib/csharp/DistribTest/DistribTest.csproj
@@ -32,6 +32,26 @@
     <ErrorReport>prompt</ErrorReport>
     <WarningLevel>4</WarningLevel>
   </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x64' ">
+    <DebugSymbols>true</DebugSymbols>
+    <OutputPath>bin\x64\Debug\</OutputPath>
+    <DefineConstants>DEBUG;TRACE</DefineConstants>
+    <DebugType>full</DebugType>
+    <PlatformTarget>x64</PlatformTarget>
+    <ErrorReport>prompt</ErrorReport>
+    <CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
+    <Prefer32Bit>true</Prefer32Bit>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x64' ">
+    <OutputPath>bin\x64\Release\</OutputPath>
+    <DefineConstants>TRACE</DefineConstants>
+    <Optimize>true</Optimize>
+    <DebugType>pdbonly</DebugType>
+    <PlatformTarget>x64</PlatformTarget>
+    <ErrorReport>prompt</ErrorReport>
+    <CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
+    <Prefer32Bit>true</Prefer32Bit>
+  </PropertyGroup>
   <ItemGroup>
     <Reference Include="BouncyCastle.Crypto">
       <HintPath>..\packages\BouncyCastle.1.7.0\lib\Net40-Client\BouncyCastle.Crypto.dll</HintPath>
diff --git a/test/distrib/csharp/run_distrib_test.bat b/test/distrib/csharp/run_distrib_test.bat
new file mode 100644
index 0000000000..950894f668
--- /dev/null
+++ b/test/distrib/csharp/run_distrib_test.bat
@@ -0,0 +1,21 @@
+
+@rem enter this directory
+cd /d %~dp0
+
+@rem extract input artifacts
+powershell -Command "Add-Type -Assembly 'System.IO.Compression.FileSystem'; [System.IO.Compression.ZipFile]::ExtractToDirectory('../../../input_artifacts/csharp_nugets.zip', 'TestNugetFeed');"
+
+update_version.sh auto
+
+set NUGET=C:\nuget\nuget.exe
+%NUGET% restore || goto :error
+
+@call build_vs2015.bat DistribTest.sln %MSBUILD_EXTRA_ARGS% || goto :error
+
+%DISTRIBTEST_OUTPATH%\DistribTest.exe || goto :error
+
+goto :EOF
+
+:error
+echo Failed!
+exit /b %errorlevel%
diff --git a/test/distrib/csharp/run_distrib_test.sh b/test/distrib/csharp/run_distrib_test.sh
index 1de62041b3..934174a9a4 100755
--- a/test/distrib/csharp/run_distrib_test.sh
+++ b/test/distrib/csharp/run_distrib_test.sh
@@ -34,9 +34,7 @@ cd $(dirname $0)
 
 unzip -o "$EXTERNAL_GIT_ROOT/input_artifacts/csharp_nugets.zip" -d TestNugetFeed
 
-# Extract the version number from Grpc nuget package name.
-CSHARP_VERSION=$(ls TestNugetFeed | grep '^Grpc\.[0-9].*\.nupkg$' | sed s/^Grpc\.// | sed s/\.nupkg$//)
-./update_version.sh $CSHARP_VERSION
+./update_version.sh auto
 
 nuget restore
diff --git a/test/distrib/csharp/update_version.sh b/test/distrib/csharp/update_version.sh
index f2554e8998..b0d07721f6 100755
--- a/test/distrib/csharp/update_version.sh
+++ b/test/distrib/csharp/update_version.sh
@@ -32,5 +32,13 @@ set -e
 
 cd $(dirname $0)
 
+CSHARP_VERSION="$1"
+if [ "$CSHARP_VERSION" == "auto" ]
+then
+  # autodetect C# version
+  CSHARP_VERSION=$(ls TestNugetFeed | grep '^Grpc\.[0-9].*\.nupkg$' | sed s/^Grpc\.// | sed s/\.nupkg$//)
+  echo "Autodetected nuget ${CSHARP_VERSION}"
+fi
+
 # Replaces version placeholder with value provided as first argument.
-sed -ibak "s/__GRPC_NUGET_VERSION__/$1/g" DistribTest/packages.config DistribTest/DistribTest.csproj
+sed -ibak "s/__GRPC_NUGET_VERSION__/${CSHARP_VERSION}/g" DistribTest/packages.config DistribTest/DistribTest.csproj
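update_version.sh's new 'auto' mode above recovers the nuget version from the Grpc.<version>.nupkg filename with grep and sed. The same extraction in Python, for readers tracing the pipeline scripts (a hypothetical helper, not part of this patch):

    import os
    import re

    def detect_csharp_version(feed_dir):
      """Returns the version embedded in Grpc.<version>.nupkg, or None."""
      for name in os.listdir(feed_dir):
        match = re.match(r'^Grpc\.(\d.*)\.nupkg$', name)
        if match:
          return match.group(1)
      return None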
diff --git a/tools/run_tests/distribtest_targets.py b/tools/run_tests/distribtest_targets.py
index 933103f0a0..fb951b68f9 100644
--- a/tools/run_tests/distribtest_targets.py
+++ b/tools/run_tests/distribtest_targets.py
@@ -97,7 +97,14 @@ class CSharpDistribTest(object):
       return create_jobspec(self.name,
                             ['test/distrib/csharp/run_distrib_test.sh'],
                             environ={'EXTERNAL_GIT_ROOT': '../../..'})
     else:
-      raise Exception("Not supported yet.")
+      if self.arch == 'x64':
+        environ={'MSBUILD_EXTRA_ARGS': '/p:Platform=x64',
+                 'DISTRIBTEST_OUTPATH': 'DistribTest\\bin\\x64\\Debug'}
+      else:
+        environ={'DISTRIBTEST_OUTPATH': 'DistribTest\\bin\\\Debug'}
+      return create_jobspec(self.name,
+                            ['test\\distrib\\csharp\\run_distrib_test.bat'],
+                            environ=environ)
 
   def __str__(self):
     return self.name
@@ -240,6 +247,8 @@ def targets():
       CSharpDistribTest('linux', 'x64', 'ubuntu1510'),
       CSharpDistribTest('linux', 'x64', 'ubuntu1604'),
       CSharpDistribTest('macos', 'x86'),
+      CSharpDistribTest('windows', 'x86'),
+      CSharpDistribTest('windows', 'x64'),
       PythonDistribTest('linux', 'x64', 'wheezy'),
       PythonDistribTest('linux', 'x64', 'jessie'),
       PythonDistribTest('linux', 'x86', 'jessie'),
-- cgit v1.2.3

From e1dd18a945ea11d1eba412ea0483b3996a222c3b Mon Sep 17 00:00:00 2001
From: Sree Kuchibhotla
Date: Mon, 29 Feb 2016 13:28:55 -0800
Subject: Fix copyright

---
 test/cpp/interop/metrics_client.cc | 2 +-
 tools/gcp/utils/big_query_utils.py | 2 +-
 tools/gcp/utils/kubernetes_api.py  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)
(limited to 'tools')

diff --git a/test/cpp/interop/metrics_client.cc b/test/cpp/interop/metrics_client.cc
index cc304f2e89..bd48c7d4ef 100644
--- a/test/cpp/interop/metrics_client.cc
+++ b/test/cpp/interop/metrics_client.cc
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright 2015, Google Inc.
+ * Copyright 2015-2016, Google Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/gcp/utils/big_query_utils.py b/tools/gcp/utils/big_query_utils.py
index e2379fd1aa..7bb1e14354 100755
--- a/tools/gcp/utils/big_query_utils.py
+++ b/tools/gcp/utils/big_query_utils.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python2.7
-# Copyright 2015-2016 Google Inc.
+# Copyright 2015-2016, Google Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/tools/gcp/utils/kubernetes_api.py b/tools/gcp/utils/kubernetes_api.py
index 2d3f771e93..e8ddd2f1b3 100755
--- a/tools/gcp/utils/kubernetes_api.py
+++ b/tools/gcp/utils/kubernetes_api.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python2.7
-# Copyright 2015-2016 Google Inc.
+# Copyright 2015-2016, Google Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
-- cgit v1.2.3

From 3f1aa9b99ae6752cc22cab2707b1d8c3846f21be Mon Sep 17 00:00:00 2001
From: Jan Tattermusch
Date: Mon, 29 Feb 2016 15:15:23 -0800
Subject: add copyright and cleanup python code

---
 test/distrib/csharp/run_distrib_test.bat | 28 ++++++++++++++++++++++++++++
 tools/run_tests/distribtest_targets.py   |  4 +++-
 2 files changed, 31 insertions(+), 1 deletion(-)
(limited to 'tools')

diff --git a/test/distrib/csharp/run_distrib_test.bat b/test/distrib/csharp/run_distrib_test.bat
index 950894f668..67bfc58ac8 100644
--- a/test/distrib/csharp/run_distrib_test.bat
+++ b/test/distrib/csharp/run_distrib_test.bat
@@ -1,3 +1,31 @@
+@rem Copyright 2016, Google Inc.
+@rem All rights reserved.
+@rem
+@rem Redistribution and use in source and binary forms, with or without
+@rem modification, are permitted provided that the following conditions are
+@rem met:
+@rem
+@rem     * Redistributions of source code must retain the above copyright
+@rem notice, this list of conditions and the following disclaimer.
+@rem     * Redistributions in binary form must reproduce the above
+@rem copyright notice, this list of conditions and the following disclaimer
+@rem in the documentation and/or other materials provided with the
+@rem distribution.
+@rem     * Neither the name of Google Inc. nor the names of its
+@rem contributors may be used to endorse or promote products derived from
+@rem this software without specific prior written permission.
+@rem
+@rem THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+@rem "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+@rem LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+@rem A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+@rem OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+@rem SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+@rem LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+@rem DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+@rem THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+@rem (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+@rem OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 @rem enter this directory
 cd /d %~dp0
 
diff --git a/tools/run_tests/distribtest_targets.py b/tools/run_tests/distribtest_targets.py
index fb951b68f9..34cc1cd710 100644
--- a/tools/run_tests/distribtest_targets.py
+++ b/tools/run_tests/distribtest_targets.py
@@ -96,7 +96,7 @@ class CSharpDistribTest(object):
       return create_jobspec(self.name,
                             ['test/distrib/csharp/run_distrib_test.sh'],
                             environ={'EXTERNAL_GIT_ROOT': '../../..'})
-    else:
+    elif self.platform == 'windows':
       if self.arch == 'x64':
         environ={'MSBUILD_EXTRA_ARGS': '/p:Platform=x64',
                  'DISTRIBTEST_OUTPATH': 'DistribTest\\bin\\x64\\Debug'}
@@ -105,6 +105,8 @@ class CSharpDistribTest(object):
       return create_jobspec(self.name,
                             ['test\\distrib\\csharp\\run_distrib_test.bat'],
                             environ=environ)
+    else:
+      raise Exception("Not supported yet.")
 
   def __str__(self):
     return self.name
-- cgit v1.2.3

From 494f3128331adcf601903b00ea62b767d99f84c2 Mon Sep 17 00:00:00 2001
From: Jan Tattermusch
Date: Tue, 1 Mar 2016 13:55:42 -0800
Subject: fix reporting for multiple test runs

---
 tools/run_tests/jobset.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
(limited to 'tools')

diff --git a/tools/run_tests/jobset.py b/tools/run_tests/jobset.py
index adf178bb3c..a3b246dc08 100755
--- a/tools/run_tests/jobset.py
+++ b/tools/run_tests/jobset.py
@@ -384,7 +384,8 @@ class Jobset(object):
                 self._travis,
                 self._add_env)
       self._running.add(job)
-      self.resultset[job.GetSpec().shortname] = []
+      if not self.resultset.has_key(job.GetSpec().shortname):
+        self.resultset[job.GetSpec().shortname] = []
       return True
 
   def reap(self):
-- cgit v1.2.3
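The jobset.py change in the last commit above guards the resultset entry so that a repeated run of the same test appends to, rather than replaces, earlier results. dict.setdefault expresses the same guard in one call; shown standalone (the helper name is illustrative, behavior is equivalent in Python 2):

    def ensure_result_list(resultset, shortname):
      """Creates the result list once; later runs keep accumulating into it."""
      return resultset.setdefault(shortname, [])

    runs = {}
    ensure_result_list(runs, 'test_a').append('PASSED')
    ensure_result_list(runs, 'test_a').append('FAILED')  # second run preserved
    assert runs['test_a'] == ['PASSED', 'FAILED']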